{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T00:09:03Z","timestamp":1771373343741,"version":"3.50.1"},"reference-count":223,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2023,9,21]],"date-time":"2023-09-21T00:00:00Z","timestamp":1695254400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,21]],"date-time":"2023-09-21T00:00:00Z","timestamp":1695254400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62002257"],"award-info":[{"award-number":["62002257"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2024"],"award-info":[{"award-number":["U21B2024"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2021M692395"],"award-info":[{"award-number":["2021M692395"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFF0901600"],"award-info":[{"award-number":["2021YFF0901600"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s00530-023-01175-x","type":"journal-article","created":{"date-parts":[[2023,9,21]],"date-time":"2023-09-21T16:02:26Z","timestamp":1695312146000},"page":"3781-3804","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["A comprehensive survey on deep-learning-based visual captioning"],"prefix":"10.1007","volume":"29","author":[{"given":"Bowen","family":"Xin","sequence":"first","affiliation":[]},{"given":"Ning","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Yingchen","family":"Zhai","sequence":"additional","affiliation":[]},{"given":"Tingting","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zimu","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Weizhi","family":"Nie","sequence":"additional","affiliation":[]},{"given":"Xuanya","family":"Li","sequence":"additional","affiliation":[]},{"given":"An-An","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,9,21]]},"reference":[{"key":"1175_CR1","doi-asserted-by":"crossref","unstructured":"Aafaq, N., Akhtar, N., Liu, W., et\u00a0al.: Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In: CVPR, pp. 
12,487\u201312,496 (2019)","DOI":"10.1109\/CVPR.2019.01277"},{"key":"1175_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., et\u00a0al.: SPICE: semantic propositional image caption evaluation. In: ECCV, pp. 382\u2013398 (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1175_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., et\u00a0al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1175_CR4","doi-asserted-by":"crossref","unstructured":"Aneja, J., Agrawal, H., Batra, D., et\u00a0al.: Sequential latent spaces for modeling the intention during diverse image captioning. In: ICCV, pp. 4260\u20134269 (2019)","DOI":"10.1109\/ICCV.2019.00436"},{"key":"1175_CR5","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: ICLR (2015)"},{"key":"1175_CR6","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: Hierarchical boundary-aware neural encoder for video captioning. In: CVPR, pp. 3185\u20133194 (2017)","DOI":"10.1109\/CVPR.2017.339"},{"key":"1175_CR7","doi-asserted-by":"crossref","unstructured":"Barati, E., Chen, X.: Critic-based attention network for event-based video captioning. In: ACMMM, pp. 811\u2013817 (2019)","DOI":"10.1145\/3343031.3351037"},{"issue":"5","key":"1175_CR8","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TSMC.1983.6313077","volume":"13","author":"AG Barto","year":"1983","unstructured":"Barto, A.G., Sutton, R.S., Anderson, C.W.: Neuronlike adaptive elements that can solve difficult learning control problems. IEEE Trans. Syst. Man Cybern. 13(5), 834\u2013846 (1983)","journal-title":"IEEE Trans. Syst. Man Cybern."},{"key":"1175_CR9","unstructured":"Bengio, Y., Ducharme, R., Vincent, P.: A neural probabilistic language model. In: NIPS, pp. 932\u2013938 (2000)"},{"key":"1175_CR10","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., et\u00a0al.: Scheduled sampling for sequence prediction with recurrent neural networks. In: NIPS, pp. 1171\u20131179 (2015)"},{"key":"1175_CR11","doi-asserted-by":"crossref","unstructured":"Biten, A.F., G\u00f3mez, L., Rusi\u00f1ol, M., et\u00a0al.: Good news, everyone! Context driven entity-aware captioning for news images. In: CVPR, pp. 12,466\u201312,475 (2019)","DOI":"10.1109\/CVPR.2019.01275"},{"key":"1175_CR12","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., et\u00a0al.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"1175_CR13","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL, pp. 190\u2013200 (2011)"},{"key":"1175_CR14","doi-asserted-by":"crossref","unstructured":"Chen, F., Ji, R., Sun, X., et\u00a0al.: Groupcap: group-based image captioning with structured relevance and diversity constraints. In: CVPR, pp. 1345\u20131353 (2018)","DOI":"10.1109\/CVPR.2018.00146"},{"key":"1175_CR15","doi-asserted-by":"crossref","unstructured":"Chen, L., Jiang, Z., Xiao, J., et\u00a0al.: Human-like controllable image captioning with verb-specific semantic roles. In: CVPR, pp. 
16,846\u201316,856 (2021)","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"1175_CR16","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.: Motion guided spatial attention for video captioning. In: AAAI, pp. 8191\u20138198 (2019)","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"1175_CR17","doi-asserted-by":"crossref","unstructured":"Chen, S., Jin, Q., Wang, P., et\u00a0al.: Say as you wish: fine-grained control of image caption generation with abstract scene graphs. In: CVPR, pp. 9959\u20139968 (2020)","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"1175_CR18","doi-asserted-by":"crossref","unstructured":"Chen, X., Ma, L., Jiang, W., et\u00a0al.: Regularizing RNNs for caption generation by reconstructing the past with the present. In: CVPR, pp. 7995\u20138003 (2018)","DOI":"10.1109\/CVPR.2018.00834"},{"key":"1175_CR19","doi-asserted-by":"crossref","unstructured":"Chen, X., Song, J., Zeng, P., et\u00a0al.: Support-set based multi-modal representation enhancement for video captioning. In: IEEE International Conference on Multimedia and Expo, pp. 1\u20136 (2022)","DOI":"10.1109\/ICME52920.2022.9859943"},{"key":"1175_CR20","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, S., Zhang, W., et\u00a0al.: Less is more: picking informative frames for video captioning. In: ECCV, pp. 367\u2013384 (2018)","DOI":"10.1007\/978-3-030-01261-8_22"},{"key":"1175_CR21","doi-asserted-by":"crossref","unstructured":"Chen, L., Zhang, H., Xiao, J., et\u00a0al.: Counterfactual critic multi-agent training for scene graph generation. In: ICCV, pp. 4612\u20134622 (2019)","DOI":"10.1109\/ICCV.2019.00471"},{"key":"1175_CR22","doi-asserted-by":"crossref","unstructured":"Chen, T., Zhang, Z., You, Q., et\u00a0al.: \"factual\" or \"emotional\": stylized image captioning with adaptive learning and attention. In: ECCV, pp. 527\u2013543 (2018)","DOI":"10.1007\/978-3-030-01249-6_32"},{"issue":"9","key":"1175_CR23","doi-asserted-by":"publisher","first-page":"2407","DOI":"10.1109\/TMM.2019.2896515","volume":"21","author":"S Chen","year":"2019","unstructured":"Chen, S., Jin, Q., Chen, J., et al.: Generating video descriptions with latent topic guidance. IEEE Trans. Multimed. 21(9), 2407\u20132418 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR24","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., G\u00fcl\u00e7ehre, \u00c7., et\u00a0al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: EMNLP, pp. 1724\u20131734 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"issue":"11","key":"1175_CR25","doi-asserted-by":"publisher","first-page":"1875","DOI":"10.1109\/TMM.2015.2477044","volume":"17","author":"K Cho","year":"2015","unstructured":"Cho, K., Courville, A.C., Bengio, Y.: Describing multimedia content using attention-based encoder-decoder networks. IEEE Trans. Multimed. 17(11), 1875\u20131886 (2015)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR26","doi-asserted-by":"crossref","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: Show, control and tell: a framework for generating controllable and grounded captions. In: CVPR, pp. 8307\u20138316 (2019)","DOI":"10.1109\/CVPR.2019.00850"},{"key":"1175_CR27","unstructured":"Cornia, M., Baraldi, L., Fiameni, G., et\u00a0al.: Universal captioner: long-tail vision-and-language model training through content-style separation. CoRR. 
arXiv: abs\/2111.12727 (2021)"},{"key":"1175_CR28","doi-asserted-by":"crossref","unstructured":"Dai, B., Fidler, S., Urtasun, R., et\u00a0al.: Towards diverse and natural image descriptions via a conditional GAN. In: ICCV, pp. 2989\u20132998 (2017)","DOI":"10.1109\/ICCV.2017.323"},{"key":"1175_CR29","unstructured":"Dai, B., Lin, D.: Contrastive learning for image captioning. In: NIPS, pp. 898\u2013907 (2017)"},{"key":"1175_CR30","doi-asserted-by":"crossref","unstructured":"Deng, C., Ding, N., Tan, M., et\u00a0al.: Length-controllable image captioning. In: ECCV, pp. 712\u2013729 (2020)","DOI":"10.1007\/978-3-030-58601-0_42"},{"key":"1175_CR31","doi-asserted-by":"crossref","unstructured":"Denkowski, M.J., Lavie, A.: Meteor universal: Language specific translation evaluation for any target language. In: WMT@ACL, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"1175_CR32","doi-asserted-by":"crossref","unstructured":"Deshpande, A., Aneja, J., Wang, L., et\u00a0al.: Fast, diverse and accurate image captioning guided by part-of-speech. In: CVPR, pp. 10,695\u201310,704 (2019)","DOI":"10.1109\/CVPR.2019.01095"},{"key":"1175_CR33","unstructured":"Devlin, J., Chang, M., Lee, K., et\u00a0al.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"1175_CR34","doi-asserted-by":"crossref","unstructured":"Dognin, P.L., Melnyk, I., Mroueh, Y., et\u00a0al.: Adversarial semantic alignment for improved image captions. In: CVPR, pp. 10,463\u201310,471 (2019)","DOI":"10.1109\/CVPR.2019.01071"},{"issue":"4","key":"1175_CR35","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1109\/TPAMI.2016.2599174","volume":"39","author":"J Donahue","year":"2017","unstructured":"Donahue, J., Hendricks, L.A., Rohrbach, M., et al.: Long-term recurrent convolutional networks for visual recognition and description. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 677\u2013691 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1175_CR36","doi-asserted-by":"crossref","unstructured":"Dong, J., Li, X., Lan, W., et\u00a0al.: Early embedding and late reranking for video captioning. In: ACMMM, pp. 1082\u20131086 (2016)","DOI":"10.1145\/2964284.2984064"},{"key":"1175_CR37","unstructured":"Duan, X., Huang, W., Gan, C., et\u00a0al.: Weakly supervised dense event captioning in videos. In: NIPS, pp. 3063\u20133073 (2018)"},{"key":"1175_CR38","unstructured":"Elliott, D., Frank, S., Hasler, E.: Multi-language image description with neural sequence models. CoRR (2015). arXiv: abs\/1510.04709"},{"key":"1175_CR39","unstructured":"Elliott, D., Keller, F.: Image description using visual dependency representations. In: EMNLP, pp. 1292\u20131302 (2013)"},{"key":"1175_CR40","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F.N., et\u00a0al.: From captions to visual concepts and back. In: CVPR, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"1175_CR41","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Hejrati, S.M.M., Sadeghi, M.A., et\u00a0al.: Every picture tells a story: generating sentences from images. In: ECCV, pp. 15\u201329 (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"1175_CR42","unstructured":"Fei, Z.: Fast image caption generation with position alignment. CoRR (2019). arXiv: abs\/1912.06365"},{"key":"1175_CR43","doi-asserted-by":"crossref","unstructured":"Fei, Z.: Iterative back modification for faster image captioning. In: ACM MM, pp. 
3182\u20133190 (2020)","DOI":"10.1145\/3394171.3413901"},{"key":"1175_CR44","doi-asserted-by":"crossref","unstructured":"Feng, Y., Ma, L., Liu, W., et\u00a0al.: Unsupervised image captioning. In: CVPR, pp. 4125\u20134134 (2019)","DOI":"10.1109\/CVPR.2019.00425"},{"issue":"2","key":"1175_CR45","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1109\/MC.2014.42","volume":"47","author":"DA Forsyth","year":"2014","unstructured":"Forsyth, D.A.: Object detection with discriminatively trained part-based models. IEEE Comput. 47(2), 6\u20137 (2014)","journal-title":"IEEE Comput."},{"issue":"12","key":"1175_CR46","doi-asserted-by":"publisher","first-page":"5910","DOI":"10.1109\/TNNLS.2018.2813306","volume":"29","author":"K Fu","year":"2018","unstructured":"Fu, K., Li, J., Jin, J., et al.: Image-text surgery: efficient concept learning in image captioning by generating pseudopairs. IEEE Trans. Neural Netw. Learn. Syst. 29(12), 5910\u20135921 (2018)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"1175_CR47","doi-asserted-by":"crossref","unstructured":"Gan, Z., Gan, C., He, X., et\u00a0al.: Semantic compositional networks for visual captioning. In: CVPR, pp. 1141\u20131150 (2017)","DOI":"10.1109\/CVPR.2017.127"},{"key":"1175_CR48","doi-asserted-by":"crossref","unstructured":"Gan, C., Yang, T., Gong, B.: Learning attributes equals multi-source domain generalization. In: CVPR, pp. 87\u201397 (2016)","DOI":"10.1109\/CVPR.2016.17"},{"key":"1175_CR49","doi-asserted-by":"crossref","unstructured":"Gao, J., Wang, S., Wang, S., et\u00a0al.: Self-critical n-step training for image captioning. In: CVPR, pp. 6300\u20136308 (2019)","DOI":"10.1109\/CVPR.2019.00646"},{"issue":"9","key":"1175_CR50","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao, L., Guo, Z., Zhang, H., et al.: Video captioning with attention-based LSTM and semantic consistency. IEEE Trans. Multimed. 19(9), 2045\u20132055 (2017)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR51","doi-asserted-by":"crossref","unstructured":"Gong, Y., Wang, L., Guo, R., et\u00a0al.: Multi-scale orderless pooling of deep convolutional activation features. In: ECCV, pp. 392\u2013407 (2014)","DOI":"10.1007\/978-3-319-10584-0_26"},{"key":"1175_CR52","doi-asserted-by":"crossref","unstructured":"Gong, Y., Wang, L., Hodosh, M., et\u00a0al.: Improving image-sentence embeddings using large weakly annotated photo collections. In: ECCV, pp. 529\u2013545 (2014)","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"1175_CR53","unstructured":"Goodfellow, I.J., Pouget-Abadie, J., Mirza, M., et\u00a0al.: Generative adversarial nets. In: NeurIPS, pp. 2672\u20132680 (2014)"},{"key":"1175_CR54","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., et\u00a0al.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: CVPR, pp. 6325\u20136334 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"1175_CR55","doi-asserted-by":"crossref","unstructured":"Gueguen, L., Hamid, R.: Large-scale damage detection using satellite imagery. In: CVPR, pp. 1321\u20131328 (2015)","DOI":"10.1109\/CVPR.2015.7298737"},{"key":"1175_CR56","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Yao, P., et\u00a0al.: Mscap: multi-style image captioning with unpaired stylized text. In: CVPR, pp. 
4204\u20134213 (2019)","DOI":"10.1109\/CVPR.2019.00433"},{"key":"1175_CR57","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Zhu, X., et\u00a0al.: Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. In: IJCAI, pp. 767\u2013773 (2020)","DOI":"10.24963\/ijcai.2020\/107"},{"key":"1175_CR58","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Venugopalan, S., Rohrbach, M., et\u00a0al.: Deep compositional captioning: Describing novel object categories without paired training data. In: CVPR, pp. 1\u201310 (2016)","DOI":"10.1109\/CVPR.2016.8"},{"key":"1175_CR59","unstructured":"Herdade, S., Kappeler, A., Boakye, K., et\u00a0al.: Image captioning: transforming objects into words. In: NIPS, pp. 11,135\u201311,145 (2019)"},{"issue":"8","key":"1175_CR60","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1175_CR61","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Artif. Intell. Res. 47, 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"1175_CR62","doi-asserted-by":"crossref","unstructured":"Hori, C., Hori, T., Lee, T., et\u00a0al.: Attention-based multimodal fusion for video description. In: ICCV, pp. 4203\u20134212 (2017)","DOI":"10.1109\/ICCV.2017.450"},{"key":"1175_CR63","doi-asserted-by":"crossref","unstructured":"Hou, J., Wu, X., Zhao, W., et\u00a0al.: Joint syntax representation learning and visual cue translation for video captioning. In: ICCV, pp. 8917\u20138926 (2019)","DOI":"10.1109\/ICCV.2019.00901"},{"key":"1175_CR64","doi-asserted-by":"crossref","unstructured":"Hu, A., Chen, S., Jin, Q.: ICECAP: information concentrated entity-aware image captioning. CoRR. arXiv: abs\/2108.02050 (2021)","DOI":"10.1145\/3394171.3413576"},{"key":"1175_CR65","doi-asserted-by":"crossref","unstructured":"Huang, Q., Gan, Z., \u00c7elikyilmaz, A., et\u00a0al.: Hierarchically structured reinforcement learning for topically coherent visual story generation. In: AAAI, pp. 8465\u20138472 (2019)","DOI":"10.1609\/aaai.v33i01.33018465"},{"key":"1175_CR66","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., et\u00a0al.: Attention on attention for image captioning. In: ICCV, pp. 4633\u20134642 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"1175_CR67","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. In: ICLR (2017)"},{"key":"1175_CR68","doi-asserted-by":"crossref","unstructured":"Jia, X., Gavves, E., Fernando, B., et\u00a0al.: Guiding the long-short term memory model for image caption generation. In: ICCV, pp. 2407\u20132415 (2015)","DOI":"10.1109\/ICCV.2015.277"},{"key":"1175_CR69","doi-asserted-by":"crossref","unstructured":"Jin, Q., Chen, J., Chen, S., et\u00a0al.: Describing videos using multi-modal fusion. In: ACM MM, pp. 1087\u20131091 (2016)","DOI":"10.1145\/2964284.2984065"},{"key":"1175_CR70","unstructured":"Jin, J., Fu, K., Cui, R., et\u00a0al.: Aligning where to see and what to tell: image caption with region-based attention and scene factorization. CoRR. 
arXiv: abs\/1506.06272 (2015)"},{"key":"1175_CR71","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: fully convolutional localization networks for dense captioning. In: CVPR, pp. 4565\u20134574 (2016)","DOI":"10.1109\/CVPR.2016.494"},{"issue":"4","key":"1175_CR72","doi-asserted-by":"publisher","first-page":"664","DOI":"10.1109\/TPAMI.2016.2598339","volume":"39","author":"A Karpathy","year":"2017","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 664\u2013676 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1175_CR73","doi-asserted-by":"crossref","unstructured":"Ke, L., Pei, W., Li, R., et\u00a0al.: Reflective decoding network for image captioning. In: ICCV, pp. 8887\u20138896 (2019)","DOI":"10.1109\/ICCV.2019.00898"},{"key":"1175_CR74","unstructured":"Khan, M.U.G., Gotoh, Y.: Describing video contents in natural language. In: Proceedings of Workshop Innovative Hybrid Approaches Process. Textual Data, pp. 27\u201335 (2012)"},{"key":"1175_CR75","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Multimodal neural language models. In: ICML, pp. 595\u2013603 (2014)"},{"issue":"1","key":"1175_CR76","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. IJCV 123(1), 32\u201373 (2017)","journal-title":"IJCV"},{"key":"1175_CR77","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., Premraj, V., Dhar, S., et\u00a0al.: Baby talk: understanding and generating simple image descriptions. In: CVPR, pp. 1601\u20131608 (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"1175_CR78","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1162\/tacl_a_00188","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova, P., Ordonez, V., Berg, T.L., et al.: TREETALK: composition and compression of trees for image descriptions. TACL 2, 351\u2013362 (2014)","journal-title":"TACL"},{"key":"1175_CR79","doi-asserted-by":"crossref","unstructured":"Laina, I., Rupprecht, C., Navab, N.: Towards unsupervised image captioning with shared multimodal embeddings. In: ICCV, pp. 7413\u20137423 (2019)","DOI":"10.1109\/ICCV.2019.00751"},{"key":"1175_CR80","doi-asserted-by":"crossref","unstructured":"Lan, W., Li, X., Dong, J.: Fluency-guided cross-lingual image captioning. In: ACM MM, pp. 1549\u20131557 (2017)","DOI":"10.1145\/3123266.3123366"},{"key":"1175_CR81","doi-asserted-by":"crossref","unstructured":"Li, Y., Pan, Y., Yao, T., et\u00a0al.: Comprehending and ordering semantics for image captioning. In: CVPR, pp. 17,969\u201317,978 (2022)","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"1175_CR82","doi-asserted-by":"crossref","unstructured":"Li, L., Tang, S., Deng, L., et\u00a0al.: Image caption with global-local attention. In: AAAI, pp. 4133\u20134139 (2017)","DOI":"10.1609\/aaai.v31i1.11236"},{"key":"1175_CR83","doi-asserted-by":"crossref","unstructured":"Li, Y., Yao, T., Mei, T., et\u00a0al.: Share-and-chat: achieving human-level video commenting by search and multi-view embedding. In: ACMMM, pp. 928\u2013937 (2016)","DOI":"10.1145\/2964284.2964320"},{"key":"1175_CR84","doi-asserted-by":"crossref","unstructured":"Li, Y., Yao, T., Pan, Y., et\u00a0al.: Pointing novel objects in image captioning. 
In: CVPR, pp. 12,497\u201312,506 (2019)","DOI":"10.1109\/CVPR.2019.01278"},{"key":"1175_CR85","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., et\u00a0al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: ECCV, pp. 121\u2013137 (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"1175_CR86","doi-asserted-by":"crossref","unstructured":"Li, G., Zhu, L., Liu, P., et\u00a0al.: Entangled transformer for image captioning. In: ICCV, pp. 8927\u20138936 (2019)","DOI":"10.1109\/ICCV.2019.00902"},{"issue":"8","key":"1175_CR87","doi-asserted-by":"publisher","first-page":"2117","DOI":"10.1109\/TMM.2019.2896516","volume":"21","author":"X Li","year":"2019","unstructured":"Li, X., Jiang, S.: Know more say less: image captioning based on scene graphs. IEEE Trans. Multimed. 21(8), 2117\u20132130 (2019)","journal-title":"IEEE Trans. Multimed."},{"issue":"1","key":"1175_CR88","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1167\/7.1.1","volume":"7","author":"F Li","year":"2007","unstructured":"Li, F., Asha, I., Christof, K., et al.: What do we perceive in a glance of a real-world scene? J. Vis. 7(1), 1\u201329 (2007)","journal-title":"J. Vis."},{"issue":"3","key":"1175_CR89","doi-asserted-by":"publisher","first-page":"726","DOI":"10.1109\/TMM.2017.2751140","volume":"20","author":"L Li","year":"2018","unstructured":"Li, L., Tang, S., Zhang, Y., et al.: GLA: global-local attention for image description. IEEE Trans. Multimed. 20(3), 726\u2013737 (2018)","journal-title":"IEEE Trans. Multimed."},{"issue":"9","key":"1175_CR90","doi-asserted-by":"publisher","first-page":"2347","DOI":"10.1109\/TMM.2019.2896494","volume":"21","author":"X Li","year":"2019","unstructured":"Li, X., Xu, C., Wang, X., et al.: COCO-CN for cross-lingual image tagging, captioning, and retrieval. IEEE Trans. Multimed. 21(9), 2347\u20132360 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR91","doi-asserted-by":"crossref","unstructured":"Liang, X., Hu, Z., Zhang, H., et\u00a0al.: Recurrent topic-transition GAN for visual paragraph generation. In: ICCV, pp. 3382\u20133391 (2017)","DOI":"10.1109\/ICCV.2017.364"},{"key":"1175_CR92","doi-asserted-by":"crossref","unstructured":"Lin, T., Maire, M., Belongie, S.J., et\u00a0al.: Microsoft COCO: common objects in context. In: ECCV, pp. 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1175_CR93","unstructured":"Lin, C.: Rouge: a package for automatic evaluation of summaries. In: ACL Workshop, pp. 74\u201381 (2004)"},{"key":"1175_CR94","unstructured":"Liu, W., Chen, S., Guo, L., et\u00a0al.: CPTR: full transformer network for image captioning. CoRR. arXiv: abs\/2101.10804 (2021)"},{"key":"1175_CR95","doi-asserted-by":"crossref","unstructured":"Liu, L., Tang, J., Wan, X., et\u00a0al.: Generating diverse and descriptive image captions using visual paraphrases. In: ICCV, pp. 4239\u20134248 (2019)","DOI":"10.1109\/ICCV.2019.00434"},{"key":"1175_CR96","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, R., Shan, S, et\u00a0al.: Structure inference net: object detection using scene-level context and instance-level relationships. In: CVPR, pp. 6985\u20136994 (2018)","DOI":"10.1109\/CVPR.2018.00730"},{"key":"1175_CR97","doi-asserted-by":"crossref","unstructured":"Liu, F., Wang, Y., Wang, T., et\u00a0al.: Visual news: benchmark and challenges in news image captioning. In: EMNLP, pp. 
6761\u20136771 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.542"},{"key":"1175_CR98","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., et\u00a0al.: Improved image captioning via policy gradient optimization of spider. In: ICCV, pp. 873\u2013881 (2017)","DOI":"10.1109\/ICCV.2017.100"},{"key":"1175_CR99","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1016\/j.cviu.2017.04.013","volume":"163","author":"A Liu","year":"2017","unstructured":"Liu, A., Xu, N., Wong, Y., et al.: Hierarchical and multimodal video captioning: discovering and transferring multimodal knowledge for vision to language. Comput. Vis. Image Underst. 163, 113\u2013125 (2017)","journal-title":"Comput. Vis. Image Underst."},{"issue":"2","key":"1175_CR100","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1109\/TIP.2018.2872879","volume":"28","author":"A Liu","year":"2019","unstructured":"Liu, A., Xu, N., Nie, W., et al.: Multi-domain and multi-task learning for human action recognition. IEEE Trans. Image Process. 28(2), 853\u2013867 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"1175_CR101","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1162\/tacl_a_00013","volume":"6","author":"X Long","year":"2018","unstructured":"Long, X., Gan, C., de Melo, G.: Video captioning with multi-faceted attention. TACL 6, 173\u2013184 (2018)","journal-title":"TACL"},{"key":"1175_CR102","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., et\u00a0al.: Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: CVPR, pp. 3242\u20133250 (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"1175_CR103","doi-asserted-by":"crossref","unstructured":"Lu, J., Yang, J., Batra, D., et\u00a0al.: Neural baby talk. In: CVPR, pp. 7219\u20137228 (2018)","DOI":"10.1109\/CVPR.2018.00754"},{"key":"1175_CR104","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ji, J., Sun, X., et\u00a0al.: Dual-level collaborative transformer for image captioning. In: AAAI, pp. 2286\u20132293 (2021)","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"1175_CR105","doi-asserted-by":"crossref","unstructured":"Luo, R., Price, B.L., Cohen, S., et\u00a0al.: Discriminability objective for training descriptive captions. In: CVPR, pp. 6964\u20136974 (2018)","DOI":"10.1109\/CVPR.2018.00728"},{"key":"1175_CR106","doi-asserted-by":"crossref","unstructured":"Ma, Z., Yang, Y., Xu, Z., et\u00a0al.: Complex event detection via multi-source video attributes. In: CVPR, pp. 2627\u20132633 (2013)","DOI":"10.1109\/CVPR.2013.339"},{"key":"1175_CR107","doi-asserted-by":"crossref","unstructured":"Mao, J., Wei, X., Yang, Y., et\u00a0al.: Learning like a child: fast novel visual concept learning from sentence descriptions of images. In: ICCV, pp. 2533\u20132541 (2015)","DOI":"10.1109\/ICCV.2015.291"},{"key":"1175_CR108","unstructured":"Mao, J., Xu, W., Yang, Y., et\u00a0al.: Deep captioning with multimodal recurrent neural networks (m-rnn). In: ICLR (2015)"},{"key":"1175_CR109","unstructured":"Maron, O., Lozano-P\u00e9rez, T.: A framework for multiple-instance learning. In: NIPS, pp. 570\u2013576 (1997)"},{"key":"1175_CR110","unstructured":"Marr, D.: Vision: a computational investigation into the human representation and processing of visual information. MIT Press, Cambridge, Massachusetts (1982)"},{"key":"1175_CR111","doi-asserted-by":"crossref","unstructured":"Mathews, A.P., Xie, L., He, X.: Semstyle: learning to generate stylised image captions using unaligned text. In: CVPR, pp. 
8591\u20138600 (2018)","DOI":"10.1109\/CVPR.2018.00896"},{"key":"1175_CR112","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J., et\u00a0al.: Howto100m: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"1175_CR113","unstructured":"Mikolov, T., Chen, K., Corrado, G., et\u00a0al.: Efficient estimation of word representations in vector space. In: ICLR (Workshop Poster) (2013)"},{"key":"1175_CR114","unstructured":"Mitchell, M., Dodge, J., Goyal, A., et\u00a0al.: Midge: generating image descriptions from computer vision detections. In: EACL, pp. 747\u2013756 (2012)"},{"key":"1175_CR115","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Chen, X., Liu, X., et\u00a0al.: The role of context for object detection and semantic segmentation in the wild. In: CVPR, pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"1175_CR116","doi-asserted-by":"crossref","unstructured":"Mun, J., Yang, L., Ren, Z., et\u00a0al.: Streamlined dense video captioning. In: CVPR, pp. 6588\u20136597 (2019)","DOI":"10.1109\/CVPR.2019.00675"},{"key":"1175_CR117","doi-asserted-by":"crossref","unstructured":"Pan, Y., Mei, T., Yao, T., et\u00a0al.: Jointly modeling embedding and translation to bridge video and language. In: CVPR, pp. 4594\u20134602 (2016)","DOI":"10.1109\/CVPR.2016.497"},{"key":"1175_CR118","doi-asserted-by":"crossref","unstructured":"Pan, P., Xu, Z., Yang, Y., et\u00a0al.: Hierarchical recurrent neural encoder for video representation with application to captioning. In: CVPR, pp. 1029\u20131038 (2016)","DOI":"10.1109\/CVPR.2016.117"},{"key":"1175_CR119","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, H., et\u00a0al.: Video captioning with transferred semantic attributes. In: CVPR, pp. 984\u2013992 (2017)","DOI":"10.1109\/CVPR.2017.111"},{"key":"1175_CR120","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., et\u00a0al.: X-linear attention networks for image captioning. In: CVPR, pp. 10,968\u201310,977 (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"1175_CR121","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., et\u00a0al.: Bleu: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1175_CR122","doi-asserted-by":"crossref","unstructured":"Park, D.H., Darrell, T., Rohrbach, A.: Robust change captioning. In: ICCV, pp. 4624\u20134633 (2019)","DOI":"10.1109\/ICCV.2019.00472"},{"key":"1175_CR123","doi-asserted-by":"crossref","unstructured":"Park, C.C., Kim, B., Kim, G.: Attend to you: personalized image captioning with context sequence memory networks. In: CVPR, pp. 6432\u20136440 (2017)","DOI":"10.1109\/CVPR.2017.681"},{"key":"1175_CR124","unstructured":"Park, C.C., Kim, G.: Expressing an image stream with a sequence of natural sentences. In: NeurIPS, pp. 73\u201381 (2015)"},{"issue":"4","key":"1175_CR125","doi-asserted-by":"publisher","first-page":"999","DOI":"10.1109\/TPAMI.2018.2824816","volume":"41","author":"CC Park","year":"2019","unstructured":"Park, C.C., Kim, B., Kim, G.: Towards personalized image captioning via multimodal memory networks. IEEE Trans. Pattern Anal. Mach. Intell. 41(4), 999\u20131012 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"1175_CR126","doi-asserted-by":"crossref","unstructured":"Pasunuru, R., Bansal, M.: Multi-task video captioning with video and entailment generation. In: ACL, pp. 1273\u20131283 (2017)","DOI":"10.18653\/v1\/D17-1103"},{"issue":"3","key":"1175_CR127","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1007\/s10278-004-1010-x","volume":"17","author":"JW Patriarche","year":"2004","unstructured":"Patriarche, J.W., Erickson, B.J.: A review of the automated detection of change in serial imaging studies of the brain. J. Digital Imaging 17(3), 158\u2013174 (2004)","journal-title":"J. Digital Imaging"},{"key":"1175_CR128","doi-asserted-by":"crossref","unstructured":"Pedersoli, M., Lucas, T., Schmid, C., et\u00a0al.: Areas of attention for image captioning. In: ICCV, pp. 1251\u20131259 (2017)","DOI":"10.1109\/ICCV.2017.140"},{"key":"1175_CR129","doi-asserted-by":"crossref","unstructured":"Pei, W., Zhang, J., Wang, X., et\u00a0al.: Memory-attended recurrent network for video captioning. In: CVPR, pp. 8347\u20138356 (2019)","DOI":"10.1109\/CVPR.2019.00854"},{"issue":"6","key":"1175_CR130","doi-asserted-by":"publisher","first-page":"1538","DOI":"10.1109\/TMM.2018.2877885","volume":"21","author":"Y Peng","year":"2019","unstructured":"Peng, Y., Qi, J.: Show and tell in the loop: cross-modal circular correlation learning. IEEE Trans. Multimed. 21(6), 1538\u20131550 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR131","doi-asserted-by":"crossref","unstructured":"Perez-Martin, J., Bustos, B., P\u00e9rez, J.: Improving video captioning with temporal composition of a visual-syntactic embedding. In: IEEE Winter Conference on Applications of Computer Vision, pp. 3038\u20133048 (2021)","DOI":"10.1109\/WACV48630.2021.00308"},{"key":"1175_CR132","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., et\u00a0al.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"1175_CR133","doi-asserted-by":"crossref","unstructured":"Prajwal, K.R., Jawahar, C.V., Kumaraguru, P.: Towards increased accessibility of meme images with the help of rich face emotion captions. In: ACM MM, pp. 202\u2013210 (2019)","DOI":"10.1145\/3343031.3350939"},{"issue":"3","key":"1175_CR134","doi-asserted-by":"publisher","first-page":"294","DOI":"10.1109\/TIP.2004.838698","volume":"14","author":"RJ Radke","year":"2005","unstructured":"Radke, R.J., Andra, S., Al-Kofahi, O., et al.: Image change detection algorithms: a systematic survey. IEEE Trans. Image Process. 14(3), 294\u2013307 (2005)","journal-title":"IEEE Trans. Image Process."},{"key":"1175_CR135","doi-asserted-by":"crossref","unstructured":"Ramanishka, V., Das, A., Park, D.H., et\u00a0al.: Multimodal video description. In: ACM MM, pp. 1092\u20131096 (2016)","DOI":"10.1145\/2964284.2984066"},{"key":"1175_CR136","unstructured":"Ranzato, M., Chopra, S., Auli, M., et\u00a0al.: Sequence level training with recurrent neural networks. In: ICLR (2016)"},{"key":"1175_CR137","doi-asserted-by":"crossref","unstructured":"Ren, Z., Wang, X., Zhang, N., et\u00a0al.: Deep reinforcement learning-based image captioning with embedding reward. In: CVPR, pp. 1151\u20131159 (2017)","DOI":"10.1109\/CVPR.2017.128"},{"key":"1175_CR138","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., et\u00a0al.: Self-critical sequence training for image captioning. In: CVPR, pp. 
1179\u20131195 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"1175_CR139","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Schiele, B.: The long-short story of movie description. In: GCPR, pp. 209\u2013221 (2015)","DOI":"10.1007\/978-3-319-24947-6_17"},{"key":"1175_CR140","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., et\u00a0al.: A dataset for movie description. In: CVPR, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"1175_CR141","doi-asserted-by":"crossref","unstructured":"Ryu, H., Kang, S., Kang, H., et\u00a0al.: Semantic grouping network for video captioning. In: AAAI, pp. 2514\u20132522 (2021)","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"1175_CR142","doi-asserted-by":"crossref","unstructured":"Sakurada, K., Okatani, T.: Change detection from a street image pair using CNN features and superpixel segmentation. In: BMVC, pp. 61.1\u201361.12 (2015)","DOI":"10.5244\/C.29.61"},{"key":"1175_CR143","doi-asserted-by":"crossref","unstructured":"Seo, P.H., Nagrani, A., Arnab, A., et\u00a0al.: End-to-end generative pretraining for multimodal video captioning. In: CVPR, pp. 17,938\u201317,947 (2022)","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"1175_CR144","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., et\u00a0al.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL, pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"1175_CR145","doi-asserted-by":"crossref","unstructured":"Shen, T., Kar, A., Fidler, S.: Learning to caption images through a lifetime by asking questions. In: ICCV, pp. 10,392\u201310,401 (2019)","DOI":"10.1109\/ICCV.2019.01049"},{"key":"1175_CR146","doi-asserted-by":"crossref","unstructured":"Shen, Z., Li, J., Su, Z., et\u00a0al.: Weakly supervised dense video captioning. In: CVPR, pp. 5159\u20135167 (2017)","DOI":"10.1109\/CVPR.2017.548"},{"key":"1175_CR147","doi-asserted-by":"crossref","unstructured":"Shetty, R., Laaksonen, J.: Frame- and segment-level features and candidate pool evaluation for video caption generation. In: ACM MM, pp. 1073\u20131076 (2016)","DOI":"10.1145\/2964284.2984062"},{"key":"1175_CR148","unstructured":"Shetty, R., Laaksonen, J.: Video captioning with recurrent networks based on frame- and video-level features and visual content classification. CoRR. arXiv: abs\/1512.02949 (2015)"},{"key":"1175_CR149","doi-asserted-by":"crossref","unstructured":"Shetty, R., Rohrbach, M., Hendricks, L.A., et\u00a0al.: Speaking the same language: Matching machine to human captions by adversarial training. In: ICCV, pp. 4155\u20134164 (2017)","DOI":"10.1109\/ICCV.2017.445"},{"key":"1175_CR150","doi-asserted-by":"crossref","unstructured":"Shi, X., Cai, J., Joty, S.R., et\u00a0al.: Watch it twice: Video captioning with a refocused video encoder. In: ACMMM, pp. 818\u2013826 (2019)","DOI":"10.1145\/3343031.3351060"},{"key":"1175_CR151","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)"},{"key":"1175_CR152","doi-asserted-by":"crossref","unstructured":"Song, Y., Chen, S., Zhao, Y., et\u00a0al.: Unpaired cross-lingual image caption generation with self-supervised rewards. In: ACM MM, pp. 784\u2013792 (2019)","DOI":"10.1145\/3343031.3350996"},{"key":"1175_CR153","doi-asserted-by":"crossref","unstructured":"Song, J., Gao, L., Guo, Z., et\u00a0al.: Hierarchical LSTM with adjusted temporal attention for video captioning. 
In: IJCAI, pp. 2737\u20132743 (2017)","DOI":"10.24963\/ijcai.2017\/381"},{"key":"1175_CR154","unstructured":"Song, J., Guo, Y., Gao, L., et\u00a0al.: From deterministic to generative: multi-modal stochastic RNNs for video captioning. CoRR. (2017). arXiv: abs\/1708.02478"},{"key":"1175_CR155","doi-asserted-by":"crossref","unstructured":"Song, X., Wang, B., Chen, G., et\u00a0al.: MUCH: mutual coupling enhancement of scene recognition and dense captioning. In: ACMMM, pp. 793\u2013801 (2019)","DOI":"10.1145\/3343031.3350913"},{"key":"1175_CR156","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Rajeswar, S., Dutil, F., et\u00a0al.: Adversarial generation of natural language. In: Rep4NLP@ACL, pp. 241\u2013251 (2017)","DOI":"10.18653\/v1\/W17-2629"},{"key":"1175_CR157","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., et\u00a0al.: Videobert: a joint model for video and language representation learning. In: ICCV, pp. 7463\u20137472 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"1175_CR158","unstructured":"Sutton, R.S., McAllester, D.A., Singh, S.P., et\u00a0al.: Policy gradient methods for reinforcement learning with function approximation. In: NIPS, pp. 1057\u20131063 (1999)"},{"key":"1175_CR159","unstructured":"Torabi, A., Pal, C.J., Larochelle, H., et\u00a0al.: Using descriptive video services to create a large data source for video annotation research. CoRR. (2015). arXiv: abs\/1503.01070"},{"key":"1175_CR160","doi-asserted-by":"crossref","unstructured":"Tran, K., He, X., Zhang, L., et\u00a0al.: Rich image captioning in the wild. In: CVPR, pp. 434\u2013441 (2016)","DOI":"10.1109\/CVPRW.2016.61"},{"key":"1175_CR161","doi-asserted-by":"crossref","unstructured":"Tran, A., Mathews, A.P., Xie, L.: Transform and tell: entity-aware news image captioning. In: CVPR, pp. 13,032\u201313,042 (2020)","DOI":"10.1109\/CVPR42600.2020.01305"},{"key":"1175_CR162","doi-asserted-by":"crossref","unstructured":"Tu, Y., Zhang, X., Liu, B., et\u00a0al.: Video description with spatial-temporal attention. In: ACMMM, pp. 1014\u20131022 (2017)","DOI":"10.1145\/3123266.3123354"},{"key":"1175_CR163","unstructured":"van Miltenburg, E., Elliott, D., Vossen, P.: Measuring the diversity of automatic image descriptions. In: COLING, pp. 1730\u20131741 (2018)"},{"key":"1175_CR164","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et\u00a0al.: Attention is all you need. In: NIPS, pp. 5998\u20136008 (2017)"},{"key":"1175_CR165","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: Cider: consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1175_CR166","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Hendricks, L.A., Mooney, R.J., et\u00a0al.: Improving lstm-based video description with linguistic knowledge mined from text. In: EMNLP, pp. 1961\u20131966 (2016)","DOI":"10.18653\/v1\/D16-1204"},{"key":"1175_CR167","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Hendricks, L.A., Rohrbach, M., et\u00a0al.: Captioning images with diverse objects. In: CVPR, pp. 1170\u20131178 (2017)","DOI":"10.1109\/CVPR.2017.130"},{"key":"1175_CR168","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., et\u00a0al.: Sequence to sequence - video to text. In: ICCV, pp. 
4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"1175_CR169","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., et\u00a0al.: Translating videos to natural language using deep recurrent neural networks. In: NAACL, pp. 1494\u20131504 (2015)","DOI":"10.3115\/v1\/N15-1173"},{"key":"1175_CR170","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., et\u00a0al.: Show and tell: a neural image caption generator. In: CVPR, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"4","key":"1175_CR171","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2017","unstructured":"Vinyals, O., Toshev, A., Bengio, S., et al.: Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 652\u2013663 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1175_CR172","unstructured":"Viola, P.A., Platt, J.C., Zhang, C.: Multiple instance boosting for object detection. In: NIPS, pp. 1417\u20131424 (2005)"},{"key":"1175_CR173","doi-asserted-by":"crossref","unstructured":"Vo, D.M., Chen, H., Sugimoto, A., et\u00a0al.: NOC-REK: novel object captioning with retrieved vocabulary from external knowledge. In: CVPR, pp. 17,979\u201317,987 (2022)","DOI":"10.1109\/CVPR52688.2022.01747"},{"key":"1175_CR174","doi-asserted-by":"crossref","unstructured":"Wang, Q., Chan, A.B.: Describing like humans: on diversity in image captioning. In: CVPR, pp. 4195\u20134203 (2019)","DOI":"10.1109\/CVPR.2019.00432"},{"key":"1175_CR175","doi-asserted-by":"crossref","unstructured":"Wang, X., Chen, W., Wu, J., et\u00a0al.: Video captioning via hierarchical reinforcement learning. In: CVPR, pp. 4213\u20134222 (2018)","DOI":"10.1109\/CVPR.2018.00443"},{"key":"1175_CR176","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., et\u00a0al.: Controllable video captioning with POS sequence guidance based on gated fusion network. In: ICCV, pp. 2641\u20132650 (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"1175_CR177","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., et\u00a0al.: Reconstruction network for video captioning. In: CVPR, pp. 7622\u20137631 (2018)","DOI":"10.1109\/CVPR.2018.00795"},{"key":"1175_CR178","doi-asserted-by":"crossref","unstructured":"Wang, J., Wang, W., Huang, Y., et\u00a0al.: M3: multimodal memory modelling for video captioning. In: CVPR, pp. 7512\u20137520 (2018)","DOI":"10.1109\/CVPR.2018.00784"},{"key":"1175_CR179","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, J., Sun, Y.: End-to-end transformer based model for image captioning. In: AAAI, pp. 2585\u20132594 (2022)","DOI":"10.1609\/aaai.v36i3.20160"},{"issue":"2","key":"1175_CR180","doi-asserted-by":"publisher","first-page":"1035","DOI":"10.1109\/TPAMI.2020.3013834","volume":"44","author":"Q Wang","year":"2022","unstructured":"Wang, Q., Wan, J., Chan, A.B.: On diversity in image captioning: metrics and methods. IEEE Trans. Pattern Anal. Mach. Intell. 44(2), 1035\u20131049 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1175_CR181","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1007\/BF00992696","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R.J.: Simple statistical gradient-following algorithms for connectionist reinforcement learning. Mach. Learn. 8, 229\u2013256 (1992)","journal-title":"Mach. 
Learn."},{"key":"1175_CR182","doi-asserted-by":"crossref","unstructured":"Wu, Q., Shen, C., Liu, L., et\u00a0al.: What value do explicit high level concepts have in vision to language problems? In: CVPR, pp. 203\u2013212 (2016)","DOI":"10.1109\/CVPR.2016.29"},{"key":"1175_CR183","doi-asserted-by":"crossref","unstructured":"Wu, M., Zhang, X., Sun, X., et\u00a0al.: Difnet: boosting visual information flow for image captioning. In: CVPR, pp. 17,999\u201318,008 (2022)","DOI":"10.1109\/CVPR52688.2022.01749"},{"issue":"11","key":"1175_CR184","doi-asserted-by":"publisher","first-page":"5241","DOI":"10.1109\/TIP.2019.2917229","volume":"28","author":"Y Xian","year":"2019","unstructured":"Xian, Y., Tian, Y.: Self-guiding multimodal LSTM\u2014when we do not have a perfect training dataset for image captioning. IEEE Trans. Image Process. 28(11), 5241\u20135252 (2019)","journal-title":"IEEE Trans. Image Process."},{"issue":"11","key":"1175_CR185","doi-asserted-by":"publisher","first-page":"2942","DOI":"10.1109\/TMM.2019.2915033","volume":"21","author":"X Xiao","year":"2019","unstructured":"Xiao, X., Wang, L., Ding, K., et al.: Deep hierarchical encoder-decoder network for image captioning. IEEE Trans. Multimed. 21(11), 2942\u20132956 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR186","unstructured":"Xu, K., Ba, J., Kiros, R., et\u00a0al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML, pp. 2048\u20132057 (2015)"},{"key":"1175_CR187","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., et\u00a0al.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"1175_CR188","doi-asserted-by":"crossref","unstructured":"Xu, N., Zhang, H., Liu, A.A., et\u00a0al.: Multi-level policy and reward-based deep reinforcement learning framework for image captioning. IEEE Trans. Multimed. (2020)","DOI":"10.1109\/TMM.2019.2941820"},{"key":"1175_CR189","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhu, Y., Choy, C.B., et\u00a0al.: Scene graph generation by iterative message passing. In: CVPR, pp. 3097\u20133106 (2017)","DOI":"10.1109\/CVPR.2017.330"},{"key":"1175_CR190","doi-asserted-by":"publisher","first-page":"477","DOI":"10.1016\/j.jvcir.2018.12.027","volume":"58","author":"N Xu","year":"2019","unstructured":"Xu, N., Liu, A., Liu, J., et al.: Scene graph Captioner: image captioning based on structural visual representation. J. Vis. Commun. Image Represent. 58, 477\u2013485 (2019)","journal-title":"J. Vis. Commun. Image Represent."},{"issue":"8","key":"1175_CR191","doi-asserted-by":"publisher","first-page":"2482","DOI":"10.1109\/TCSVT.2018.2867286","volume":"29","author":"N Xu","year":"2019","unstructured":"Xu, N., Liu, A., Wong, Y., et al.: Dual-stream recurrent neural network for video captioning. IEEE Trans. Circ. Syst. Video Technol. 29(8), 2482\u20132493 (2019)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"1175_CR192","doi-asserted-by":"crossref","unstructured":"Yang, X., Karaman, S., Tetreault, J.R., et\u00a0al.: Journalistic guidelines aware news image captioning. In: EMNLP, pp. 5162\u20135175 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.419"},{"key":"1175_CR193","doi-asserted-by":"crossref","unstructured":"Yang, L., Tang, K.D., Yang, J., et\u00a0al.: Dense captioning with joint inference and visual context. In: CVPR, pp. 
1978\u20131987 (2017)","DOI":"10.1109\/CVPR.2017.214"},{"key":"1175_CR194","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., et\u00a0al.: Auto-encoding scene graphs for image captioning. In: CVPR, pp. 10,685\u201310,694 (2019)","DOI":"10.1109\/CVPR.2019.01094"},{"key":"1175_CR195","unstructured":"Yang, Z., Yuan, Y., Wu, Y., et\u00a0al.: Encode, review, and decode: reviewer module for caption generation. CoRR. arXiv: abs\/1605.07912 (2016)"},{"key":"1175_CR196","doi-asserted-by":"crossref","unstructured":"Yang, X., Zhang, H., Cai, J.: Learning to collocate neural modules for image captioning. In: ICCV, pp. 4249\u20134259 (2019)","DOI":"10.1109\/ICCV.2019.00435"},{"key":"1175_CR197","doi-asserted-by":"crossref","unstructured":"Yang, B., Zou, Y., Liu, F., et\u00a0al.: Non-autoregressive coarse-to-fine video captioning. In: AAAI, pp. 3119\u20133127 (2021)","DOI":"10.1609\/aaai.v35i4.16421"},{"issue":"11","key":"1175_CR198","doi-asserted-by":"publisher","first-page":"5600","DOI":"10.1109\/TIP.2018.2855422","volume":"27","author":"Y Yang","year":"2018","unstructured":"Yang, Y., Zhou, J., Ai, J., et al.: Video captioning by adversarial LSTM. IEEE Trans. Image Process. 27(11), 5600\u20135611 (2018)","journal-title":"IEEE Trans. Image Process."},{"issue":"4","key":"1175_CR199","doi-asserted-by":"publisher","first-page":"1047","DOI":"10.1109\/TMM.2018.2869276","volume":"21","author":"M Yang","year":"2019","unstructured":"Yang, M., Zhao, W., Xu, W., et al.: Multitask learning for cross-domain image captioning. IEEE Trans. Multimed. 21(4), 1047\u20131061 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"1175_CR200","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., et\u00a0al.: Boosting image captioning with attributes. In: ICCV, pp. 4904\u20134912 (2017)","DOI":"10.1109\/ICCV.2017.524"},{"key":"1175_CR201","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., et\u00a0al.: Exploring visual relationship for image captioning. In: ECCV, pp. 711\u2013727 (2018)","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"1175_CR202","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., et\u00a0al.: Hierarchy parsing for image captioning. In: ICCV, pp. 2621\u20132629 (2019)","DOI":"10.1109\/ICCV.2019.00271"},{"key":"1175_CR203","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., et\u00a0al.: Describing videos by exploiting temporal structure. In: ICCV, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"1175_CR204","doi-asserted-by":"crossref","unstructured":"Yin, G., Sheng, L., Liu, B., et\u00a0al.: Context and attribute grounded dense captioning. In: CVPR, pp. 6241\u20136250 (2019)","DOI":"10.1109\/CVPR.2019.00640"},{"key":"1175_CR205","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., et\u00a0al.: Image captioning with semantic attention. In: CVPR, pp. 4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"1175_CR206","doi-asserted-by":"crossref","unstructured":"Yu, H., Wang, J., Huang, Z., et\u00a0al.: Video paragraph captioning using hierarchical recurrent neural networks. In: CVPR, pp. 4584\u20134593 (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"1175_CR207","doi-asserted-by":"crossref","unstructured":"Yu, L., Zhang, W., Wang, J., et\u00a0al.: Seqgan: sequence generative adversarial nets with policy gradient. In: AAAI, pp. 
2852\u20132858 (2017)","DOI":"10.1609\/aaai.v31i1.10804"},{"key":"1175_CR208","doi-asserted-by":"crossref","unstructured":"Zeng, P., Zhang, H., Song, J., et\u00a0al.: S2 transformer for image captioning. In: IJCAI, pp. 1608\u20131614 (2022)","DOI":"10.24963\/ijcai.2022\/224"},{"key":"1175_CR209","doi-asserted-by":"crossref","unstructured":"Zhang, H., Dana, K.J., Shi, J., et\u00a0al.: Context encoding for semantic segmentation. In: CVPR, pp. 7151\u20137160 (2018)","DOI":"10.1109\/CVPR.2018.00747"},{"key":"1175_CR210","doi-asserted-by":"crossref","unstructured":"Zhang, J., Fang, S., Mao, Z., et\u00a0al.: Fine-tuning with multi-modal entity prompts for news image captioning. In: ACM MM, pp. 4365\u20134373 (2022)","DOI":"10.1145\/3503161.3547883"},{"key":"1175_CR211","doi-asserted-by":"crossref","unstructured":"Zhang, X., Gao, K., Zhang, Y., et\u00a0al.: Task-driven dynamic fusion: reducing ambiguity in video description. In: CVPR, pp. 6250\u20136258 (2017)","DOI":"10.1109\/CVPR.2017.662"},{"key":"1175_CR212","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., et\u00a0al.: Vinvl: Revisiting visual representations in vision-language models. In: CVPR, pp. 5579\u20135588 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"1175_CR213","doi-asserted-by":"crossref","unstructured":"Zhang, J., Peng, Y.: Object-aware aggregation with bidirectional temporal graph for video captioning. In: CVPR, pp. 8327\u20138336 (2019)","DOI":"10.1109\/CVPR.2019.00852"},{"key":"1175_CR214","doi-asserted-by":"crossref","unstructured":"Zhang, X., Sun, X., Luo, Y., et\u00a0al.: Rstnet: captioning with adaptive attention on visual and non-visual words. In: CVPR, pp. 15,465\u201315,474 (2021)","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"1175_CR215","unstructured":"Zhang, L., Sung, F., Liu, F., et\u00a0al.: Actor-critic sequence training for image captioning. CoRR. (2017). arXiv: abs\/1706.09601"},{"key":"1175_CR216","unstructured":"Zhao, W., Hu, Y., Wang, H., et\u00a0al.: Boosting entity-aware image captioning with multi-modal knowledge graph. CoRR. (2021). arXiv: abs\/2107.11970"},{"key":"1175_CR217","doi-asserted-by":"crossref","unstructured":"Zhao, B., Li, X., Lu, X.: Video captioning with tube features. In: IJCAI, pp. 1177\u20131183 (2018)","DOI":"10.24963\/ijcai.2018\/164"},{"key":"1175_CR218","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., et\u00a0al.: Pyramid scene parsing network. In: CVPR, pp. 6230\u20136239 (2017)","DOI":"10.1109\/CVPR.2017.660"},{"issue":"11","key":"1175_CR219","doi-asserted-by":"publisher","first-page":"5552","DOI":"10.1109\/TIP.2019.2916757","volume":"28","author":"B Zhao","year":"2019","unstructured":"Zhao, B., Li, X., Lu, X.: CAM-RNN: co-attention model based RNN for video captioning. IEEE Trans. Image Process. 28(11), 5552\u20135565 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"1175_CR220","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Li, Y., Wang, S.: Intention oriented image captions with guiding objects. In: CVPR, pp. 8395\u20138404 (2019)","DOI":"10.1109\/CVPR.2019.00859"},{"key":"1175_CR221","unstructured":"Zhou, L., Palangi, H., Zhang, L., et\u00a0al.: Unified vision-language pre-training for image captioning and VQA. In: AAAI, pp. 13,041\u201313,049 (2020)"},{"key":"1175_CR222","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., et\u00a0al.: End-to-end dense video captioning with masked transformer. In: CVPR, pp. 
8739\u20138748 (2018)","DOI":"10.1109\/CVPR.2018.00911"},{"key":"1175_CR223","doi-asserted-by":"publisher","first-page":"694","DOI":"10.1109\/TIP.2019.2928144","volume":"29","author":"L Zhou","year":"2020","unstructured":"Zhou, L., Zhang, Y., Jiang, Y., et al.: Re-caption: saliency-enhanced image captioning through two-phase learning. IEEE Trans. Image Process. 29, 694\u2013709 (2020)","journal-title":"IEEE Trans. Image Process."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01175-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-023-01175-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01175-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,16]],"date-time":"2023-11-16T11:17:33Z","timestamp":1700133453000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-023-01175-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,21]]},"references-count":223,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["1175"],"URL":"https:\/\/doi.org\/10.1007\/s00530-023-01175-x","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,9,21]]},"assertion":[{"value":"15 April 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 August 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 September 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
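Since this section is the raw Crossref REST API response for the survey above, a minimal sketch of how such a record can be fetched and read may be useful. Everything in the sketch is an assumption-labeled illustration rather than part of the record itself: the endpoint and the "message" wrapper are the public Crossref API shape shown above, `requests` is an assumed third-party dependency, and the `mailto` address is a placeholder used for Crossref's polite-pool etiquette.

```python
# Minimal sketch (illustrative, not part of the record): fetch a Crossref
# work record like the one above and read a few of its fields. Assumes the
# third-party `requests` package; the `mailto` address is a placeholder.
import requests

DOI = "10.1007/s00530-023-01175-x"

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # polite-pool identification
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # the payload sits under "message"

print(work["title"][0])            # "A comprehensive survey on ..."
print(work["container-title"][0])  # "Multimedia Systems"
print(work["volume"], work["issue"], work["page"])
print("references:", work["references-count"])

# Each entry of "reference" carries a DOI when Crossref matched one,
# otherwise only the raw "unstructured" citation string.
for ref in work.get("reference", [])[:5]:
    print(ref.get("DOI", "-"), "|", ref.get("unstructured", "")[:60])
```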