{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T10:06:36Z","timestamp":1771668396919,"version":"3.50.1"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T00:00:00Z","timestamp":1736380800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T00:00:00Z","timestamp":1736380800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Complex Intell. Syst."],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s40747-024-01741-4","type":"journal-article","created":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T08:40:37Z","timestamp":1736412037000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["MKER: multi-modal knowledge extraction and reasoning for future event prediction"],"prefix":"10.1007","volume":"11","author":[{"given":"Chenghang","family":"Lai","sequence":"first","affiliation":[]},{"given":"Shoumeng","family":"Qiu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,9]]},"reference":[{"key":"1741_CR1","doi-asserted-by":"crossref","unstructured":"Caba Heilbron F, Escorcia V, Ghanem B, Carlos Niebles J (2015) ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 961\u2013970","DOI":"10.1109\/CVPR.2015.7298698"},{"issue":"12","key":"1741_CR2","doi-asserted-by":"publisher","first-page":"4057","DOI":"10.1162\/jocn_a_00078","volume":"23","author":"JM Zacks","year":"2011","unstructured":"Zacks JM, Kurby CA, Eisenberg ML, Haroutunian N (2011) Prediction error associated with the perceptual segmentation of naturalistic events. J Cognit Neurosci 23(12):4057\u20134066","journal-title":"J Cognit Neurosci"},{"issue":"1","key":"1741_CR3","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1207\/s15516709cog1901_3","volume":"19","author":"K Stenning","year":"1995","unstructured":"Stenning K, Oberlander J (1995) A cognitive theory of graphical and linguistic reasoning: logic and implementation. Cogn Sci 19(1):97\u2013140","journal-title":"Cogn Sci"},{"issue":"3","key":"1741_CR4","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1007\/s13218-019-00603-3","volume":"33","author":"U Furbach","year":"2019","unstructured":"Furbach U, H\u00f6lldobler S, Ragni M, Schon C, Stolzenburg F (2019) Cognitive reasoning: a personal view. KI-K\u00fcnstliche Intelligenz 33(3):209\u2013217","journal-title":"KI-K\u00fcnstliche Intelligenz"},{"key":"1741_CR5","unstructured":"OpenAI: ChatGPT (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"1741_CR6","doi-asserted-by":"crossref","unstructured":"Min S, Lewis M, Zettlemoyer L, Hajishirzi H (2022) MetaICL: learning to learn in context. In: Proceedings of the 2022 conference of the North American chapter of the Association for Computational Linguistics: Human Language Technologies, pp 2791\u20132809","DOI":"10.18653\/v1\/2022.naacl-main.201"},{"key":"1741_CR7","unstructured":"Brown TB (2020) Language models are few-shot learners. arXiv preprint arXiv:2005.14165"},{"key":"1741_CR8","volume-title":"The book of why: the new science of cause and effect","author":"J Pearl","year":"2018","unstructured":"Pearl J, Mackenzie D (2018) The book of why: the new science of cause and effect. Basic Books, New York"},{"issue":"7553","key":"1741_CR9","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun Y, Bengio Y, Hinton G (2015) Deep learning. Nature 521(7553):436\u2013444","journal-title":"Nature"},{"issue":"3","key":"1741_CR10","doi-asserted-by":"publisher","first-page":"255","DOI":"10.1037\/h0084295","volume":"45","author":"A Paivio","year":"1991","unstructured":"Paivio A (1991) Dual coding theory: retrospect and current status. Can J Psychol\/Revue canadienne de psychologie 45(3):255","journal-title":"Can J Psychol\/Revue canadienne de psychologie"},{"issue":"12","key":"1741_CR11","doi-asserted-by":"publisher","first-page":"1003966","DOI":"10.1371\/journal.pcbi.1003966","volume":"10","author":"L Albantakis","year":"2014","unstructured":"Albantakis L, Hintze A, Koch C, Adami C, Tononi G (2014) Evolution of integrated causal structures in animats exposed to environments of increasing complexity. PLoS Comput Biol 10(12):1003966","journal-title":"PLoS Comput Biol"},{"issue":"1","key":"1741_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1162\/NECO_a_00912","volume":"29","author":"K Friston","year":"2017","unstructured":"Friston K, FitzGerald T, Rigoli F, Schwartenbeck P, Pezzulo G (2017) Active inference: a process theory. Neural Comput 29(1):1\u201349","journal-title":"Neural Comput"},{"issue":"3","key":"1741_CR13","doi-asserted-by":"publisher","first-page":"271","DOI":"10.1080\/096725500750039282","volume":"8","author":"N Kompridis","year":"2000","unstructured":"Kompridis N (2000) So we need something else for reason to mean. Int J Philos Stud 8(3):271\u2013295","journal-title":"Int J Philos Stud"},{"key":"1741_CR14","doi-asserted-by":"crossref","unstructured":"Teney D, Liu L, Den Hengel A (2017) Graph-structured representations for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1\u20139","DOI":"10.1109\/CVPR.2017.344"},{"key":"1741_CR15","doi-asserted-by":"crossref","unstructured":"Johnson J, Gupta A, Fei-Fei L (2018) Image generation from scene graphs. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1219\u20131228","DOI":"10.1109\/CVPR.2018.00133"},{"key":"1741_CR16","doi-asserted-by":"publisher","first-page":"2742","DOI":"10.1109\/TIP.2019.2952088","volume":"29","author":"Y Ji","year":"2019","unstructured":"Ji Y, Zhan Y, Yang Y, Xu X, Shen F, Shen HT (2019) A context knowledge map guided coarse-to-fine action recognition. IEEE Trans Image Process 29:2742\u20132752","journal-title":"IEEE Trans Image Process"},{"key":"1741_CR17","doi-asserted-by":"crossref","unstructured":"Lin BY, Chen X, Chen J, Ren X (2019) KagNet: knowledge-aware graph networks for commonsense reasoning. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP), pp 2829\u20132839","DOI":"10.18653\/v1\/D19-1282"},{"key":"1741_CR18","doi-asserted-by":"publisher","first-page":"5929","DOI":"10.1007\/s10462-020-09838-1","volume":"53","author":"G Van Houdt","year":"2020","unstructured":"Van Houdt G, Mosquera C, N\u00e1poles G (2020) A review on the long short-term memory model. Artif Intell Rev 53:5929\u20135955","journal-title":"Artif Intell Rev"},{"key":"1741_CR19","doi-asserted-by":"crossref","unstructured":"Guo W, Du Y, Shen X, Lepetit V, Alameda-Pineda X, Moreno-Noguer F (2023) Back to MLP: a simple baseline for human motion prediction. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 4809\u20134819","DOI":"10.1109\/WACV56688.2023.00479"},{"issue":"2","key":"1741_CR20","doi-asserted-by":"publisher","first-page":"537","DOI":"10.1007\/s00426-022-01674-y","volume":"87","author":"A Weiden","year":"2023","unstructured":"Weiden A, Porcu E, Liepelt R (2023) Action prediction modulates self-other integration in joint action. Psychol Res 87(2):537\u2013552","journal-title":"Psychol Res"},{"key":"1741_CR21","doi-asserted-by":"crossref","unstructured":"Mahdavian M, Nikdel P, TaherAhmadi M, Chen M (2023) STPOTR: simultaneous human trajectory and pose prediction using a non-autoregressive transformer for robot follow-ahead. In: 2023 IEEE international conference on robotics and automation (ICRA). IEEE, pp 9959\u20139965","DOI":"10.1109\/ICRA48891.2023.10160538"},{"issue":"8","key":"1741_CR22","doi-asserted-by":"publisher","first-page":"3689","DOI":"10.1109\/TCSVT.2023.3239322","volume":"33","author":"J Tang","year":"2023","unstructured":"Tang J, Zhang J, Ding R, Gu B, Yin J (2023) Collaborative multi-dynamic pattern modeling for human motion prediction. IEEE Trans Circuits Syst Video Technol 33(8):3689\u20133700","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"2","key":"1741_CR23","doi-asserted-by":"publisher","first-page":"2843","DOI":"10.1109\/TII.2023.3298476","volume":"20","author":"Y Liu","year":"2024","unstructured":"Liu Y, Liu J, Yang K, Ju B, Liu S, Wang Y, Yang D, Sun P, Song L (2024) AMP-Net: appearance-motion prototype network assisted automatic video anomaly detection system. IEEE Trans Ind Inform 20(2):2843\u20132855","journal-title":"IEEE Trans Ind Inform"},{"issue":"3","key":"1741_CR24","doi-asserted-by":"publisher","first-page":"3240","DOI":"10.1007\/s10489-022-03613-1","volume":"53","author":"V-T Le","year":"2023","unstructured":"Le V-T, Kim Y-G (2023) Attention-based residual autoencoder for video anomaly detection. Appl Intell 53(3):3240\u20133254","journal-title":"Appl Intell"},{"key":"1741_CR25","doi-asserted-by":"crossref","unstructured":"Lin Z, Sun J, Hu J-F, Yu Q, Lai J-H, Zheng W-S (2021) Predictive feature learning for future segmentation prediction. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7365\u20137374","DOI":"10.1109\/ICCV48922.2021.00727"},{"key":"1741_CR26","doi-asserted-by":"crossref","unstructured":"Peri N, Luiten J, Li M, O\u0161ep A, Leal-Taix\u00e9 L, Ramanan D (2022) Forecasting from lidar via future object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17202\u201317211","DOI":"10.1109\/CVPR52688.2022.01669"},{"key":"1741_CR27","doi-asserted-by":"crossref","unstructured":"Gao Z, Tan C, Wu L, Li SZ (2022) SimVP: simpler yet better video prediction. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3170\u20133180","DOI":"10.1109\/CVPR52688.2022.00317"},{"key":"1741_CR28","doi-asserted-by":"crossref","unstructured":"Girdhar R, Grauman K (2021) Anticipative video transformer. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 13505\u201313515","DOI":"10.1109\/ICCV48922.2021.01325"},{"issue":"11","key":"1741_CR29","doi-asserted-by":"publisher","first-page":"7505","DOI":"10.1109\/TPAMI.2021.3129349","volume":"44","author":"W Luo","year":"2021","unstructured":"Luo W, Liu W, Lian D, Gao S (2021) Future frame prediction network for video anomaly detection. IEEE Trans Pattern Anal Mach Intell 44(11):7505\u20137520","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1741_CR30","unstructured":"Santoro A, Raposo D, Barrett DG, Malinowski M, Pascanu R, Battaglia P, Lillicrap T (2017) A simple neural network module for relational reasoning. In: Proceedings of the international conference on neural information processing systems, pp 4974\u20134983"},{"key":"1741_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126390","volume":"550","author":"P Shao","year":"2023","unstructured":"Shao P, He J, Li G, Zhang D, Tao J (2023) Hierarchical graph attention network for temporal knowledge graph reasoning. Neurocomputing 550:126390","journal-title":"Neurocomputing"},{"key":"1741_CR32","doi-asserted-by":"publisher","first-page":"249","DOI":"10.1016\/j.neucom.2022.02.011","volume":"483","author":"G Niu","year":"2022","unstructured":"Niu G, Li B, Zhang Y, Sheng Y, Shi C, Li J, Pu S (2022) Joint semantics and data-driven path representation for knowledge graph reasoning. Neurocomputing 483:249\u2013261","journal-title":"Neurocomputing"},{"issue":"2","key":"1741_CR33","doi-asserted-by":"publisher","first-page":"201","DOI":"10.26599\/BDMA.2022.9020021","volume":"6","author":"X Wu","year":"2023","unstructured":"Wu X, Duan J, Pan Y, Li M (2023) Medical knowledge graph: data sources, construction, reasoning, and applications. Big Data Min Anal 6(2):201\u2013217","journal-title":"Big Data Min Anal"},{"key":"1741_CR34","doi-asserted-by":"crossref","unstructured":"Yang A, Nagrani A, Seo PH, Miech A, Pont-Tuset J, Laptev I, Sivic J, Schmid C (2023) Vid2Seq: large-scale pretraining of a visual language model for dense video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10714\u201310726","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"1741_CR35","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109204","volume":"136","author":"Y Tu","year":"2023","unstructured":"Tu Y, Zhou C, Guo J, Li H, Gao S, Yu Z (2023) Relation-aware attention for video captioning via graph learning. Pattern Recogn 136:109204","journal-title":"Pattern Recogn"},{"key":"1741_CR36","doi-asserted-by":"publisher","first-page":"10261","DOI":"10.1016\/j.artmed.2023.102611","volume":"143","author":"Z Lin","year":"2023","unstructured":"Lin Z, Zhang D, Tao Q, Shi D, Haffari G, Wu Q, He M, Ge Z (2023) Medical visual question answering: a survey. Artif Intell Med 143:10261","journal-title":"Artif Intell Med"},{"issue":"10","key":"1741_CR37","doi-asserted-by":"publisher","first-page":"11624","DOI":"10.1109\/TPAMI.2023.3284038","volume":"45","author":"Y Liu","year":"2023","unstructured":"Liu Y, Li G, Lin L (2023) Cross-modal causal relational reasoning for event-level visual question answering. IEEE Trans Pattern Anal Mach Intell 45(10):11624\u201311641","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1741_CR38","doi-asserted-by":"crossref","unstructured":"Li M, Wang H, Zhang W, Miao J, Zhao Z, Zhang S, Ji W, Wu F (2023) Winner: weakly-supervised hierarchical decomposition and alignment for spatio-temporal video grounding. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 23090\u201323099","DOI":"10.1109\/CVPR52729.2023.02211"},{"issue":"2","key":"1741_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3532626","volume":"19","author":"X Lan","year":"2023","unstructured":"Lan X, Yuan Y, Wang X, Wang Z, Zhu W (2023) A survey on temporal sentence grounding in videos. ACM Trans Multimed Comput Commun Appl 19(2):1\u201333","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"1741_CR40","doi-asserted-by":"crossref","unstructured":"Voigtlaender P, Changpinyo S, Pont-Tuset J, Soricut R, Ferrari V (2023) Connecting vision and language with video localized narratives. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2461\u20132471","DOI":"10.1109\/CVPR52729.2023.00243"},{"key":"1741_CR41","doi-asserted-by":"crossref","unstructured":"Xu C, Yang M, Li C, Shen Y, Ao X, Xu R (2021) Imagine, reason and write: visual storytelling with graph knowledge and relational reasoning. In: Proceedings of the AAAI conference on artificial intelligence, vol 35, pp 3022\u20133029","DOI":"10.1609\/aaai.v35i4.16410"},{"key":"1741_CR42","doi-asserted-by":"crossref","unstructured":"Pei W, Zhang J, Wang X, Ke L, Shen X, Tai Y-W (2019) Memory-attended recurrent network for video captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8347\u20138356","DOI":"10.1109\/CVPR.2019.00854"},{"key":"1741_CR43","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126523","volume":"552","author":"Y Wei","year":"2023","unstructured":"Wei Y, Yuan S, Chen M, Shen X, Wang L, Shen L, Yan Z (2023) MPP-Net: multi-perspective perception network for dense video captioning. Neurocomputing 552:126523","journal-title":"Neurocomputing"},{"key":"1741_CR44","doi-asserted-by":"crossref","unstructured":"Ding Y, Yu J, Liu B, Hu Y, Cui M, Wu Q (2022) MuKEA: multimodal knowledge extraction and accumulation for knowledge-based visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5089\u20135098","DOI":"10.1109\/CVPR52688.2022.00503"},{"key":"1741_CR45","doi-asserted-by":"crossref","unstructured":"Hou J, Wu X, Zhang X, Qi Y, Jia Y, Luo J (2020) Joint commonsense and relation reasoning for image and video captioning. In: Proceedings of the AAAI conference on artificial intelligence, vol 34, pp 10973\u201310980 (2020)","DOI":"10.1609\/aaai.v34i07.6731"},{"key":"1741_CR46","doi-asserted-by":"crossref","unstructured":"Yang A, Miech A, Sivic J, Laptev I, Schmid C (2022) TubeDETR: spatio-temporal video grounding with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 16442\u201316453","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"1741_CR47","doi-asserted-by":"crossref","unstructured":"Hosseinzadeh M, Wang Y (2021) Video captioning of future frames. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 980\u2013989","DOI":"10.1109\/WACV48630.2021.00102"},{"key":"1741_CR48","doi-asserted-by":"crossref","unstructured":"Lei J, Yu L, Berg TL, Bansal M (2020) What is more likely to happen next? Video-and-language future event prediction. arXiv preprint arXiv:2010.07999","DOI":"10.18653\/v1\/2020.emnlp-main.706"},{"issue":"1","key":"1741_CR49","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji S, Xu W, Yang M, Yu K (2012) 3d convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1741_CR50","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J(2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR, pp 8748\u20138763"},{"issue":"1","key":"1741_CR51","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1017\/S1351324916000334","volume":"23","author":"KW Church","year":"2017","unstructured":"Church KW (2017) Word2vec. Nat Lang Eng 23(1):155\u2013162","journal-title":"Nat Lang Eng"},{"key":"1741_CR52","unstructured":"Speer R, Havasi C (2021) Representing general relational knowledge in ConceptNet 5. In: LREC, vol 2012, pp 3679\u20133686"},{"key":"1741_CR53","doi-asserted-by":"crossref","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster R-CNN: towards real-time object detection with region proposal networks. In: Proceedings of the international conference on neural information processing systems, pp 1137\u20131149","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"1741_CR54","doi-asserted-by":"crossref","unstructured":"Zellers R, Yatskar M, Thomson S, Choi Y (2018) Neural motifs: scene graph parsing with global context. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5831\u20135840","DOI":"10.1109\/CVPR.2018.00611"},{"key":"1741_CR55","unstructured":"Narasimhan M, Lazebnik S, Schwing A (2018) Out of the box: reasoning with graph convolution nets for factual visual question answering. In: Proceedings of the international conference on neural information processing systems, pp 2659\u20132670"},{"key":"1741_CR56","unstructured":"Berg Rvd, Kipf TN, Welling M (2017) Graph convolutional matrix completion. arXiv preprint arXiv:1706.02263"},{"issue":"8","key":"1741_CR57","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"1741_CR58","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Proceedings of the international conference on neural information processing systems, pp 5998\u20136008"},{"key":"1741_CR59","doi-asserted-by":"crossref","unstructured":"Zhou L, Xu C, Corso J (2018) Towards automatic learning of procedures from web instructional videos. In: Proceedings of the AAAI conference on artificial intelligence, vol 32","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"1741_CR60","doi-asserted-by":"crossref","unstructured":"Krishna R, Hata K, Ren F, Fei-Fei L, Carlos Niebles J (2017) Dense-captioning events in videos. In: Proceedings of the IEEE international conference on computer vision (ICCV)","DOI":"10.1109\/ICCV.2017.83"},{"key":"1741_CR61","doi-asserted-by":"crossref","unstructured":"Zhou L, Zhou Y, Corso JJ, Socher R, Xiong C (2018) End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8739\u20138748","DOI":"10.1109\/CVPR.2018.00911"},{"key":"1741_CR62","doi-asserted-by":"crossref","unstructured":"Dai Z, Yang Z, Yang Y, Carbonell J, Le QV, Salakhutdinov R (2019) Transformer-XL: attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860","DOI":"10.18653\/v1\/P19-1285"},{"key":"1741_CR63","doi-asserted-by":"crossref","unstructured":"Lei J, Wang L, Shen Y, Yu D, Berg T, Bansal M (2020) MART: memory-augmented recurrent transformer for coherent video paragraph captioning. In: Proceedings of the 58th annual meeting of the Association for Computational Linguistics, pp 2603\u20132614","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"1741_CR64","unstructured":"Paszke A, Gross S, Chintala S, Chanan G, Yang E, DeVito Z, Lin Z, Desmaison A, Antiga L, Lerer A (2017) Automatic differentiation in PyTorch"},{"key":"1741_CR65","unstructured":"Kingma DP (2024) Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"key":"1741_CR66","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"1741_CR67","doi-asserted-by":"crossref","unstructured":"Denkowski M, Lavie A (2014) Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the ninth workshop on statistical machine translation, pp 376\u2013380","DOI":"10.3115\/v1\/W14-3348"},{"key":"1741_CR68","doi-asserted-by":"crossref","unstructured":"Lin C-Y, Och FJ (2004) Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics. In: Proceedings of the 42nd annual meeting of the Association for Computational Linguistics (ACL-04), pp 605\u2013612","DOI":"10.3115\/1218955.1219032"},{"key":"1741_CR69","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, Parikh D (2015) CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"}],"container-title":["Complex &amp; Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-024-01741-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40747-024-01741-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-024-01741-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T16:27:51Z","timestamp":1738945671000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40747-024-01741-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,9]]},"references-count":69,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["1741"],"URL":"https:\/\/doi.org\/10.1007\/s40747-024-01741-4","relation":{},"ISSN":["2199-4536","2198-6053"],"issn-type":[{"value":"2199-4536","type":"print"},{"value":"2198-6053","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,9]]},"assertion":[{"value":"17 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 October 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This paper is neither the entire paper nor any parts of its content has been published or is under consideration for publication elsewhere.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"All of the authors agree to the submission of this paper.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"Materials is available.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Materials availability"}}],"article-number":"138"}}