{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T16:05:44Z","timestamp":1769270744308,"version":"3.49.0"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032024053","type":"print"},{"value":"9783032024060","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,8,24]],"date-time":"2025-08-24T00:00:00Z","timestamp":1755993600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,24]],"date-time":"2025-08-24T00:00:00Z","timestamp":1755993600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02406-0_14","type":"book-chapter","created":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T09:25:33Z","timestamp":1755941133000},"page":"194-207","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Video Captioning with Spatio-Temporal Graph Transformers"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6223-4502","authenticated-orcid":false,"given":"Shakhnoza","family":"Muksimova","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7694-1806","authenticated-orcid":false,"given":"Sabina","family":"Umirzakova","sequence":"additional","affiliation":[]},{"given":"Sevara","family":"Mardieva","sequence":"additional","affiliation":[]},{"given":"Nargiza","family":"Iskhakova","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0184-7599","authenticated-orcid":false,"given":"Young Im","family":"Cho","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,24]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Seo, P.H., Nagrani, A., Arnab, A., Schmid, C.: End-to-end generative pretraining for multimodal video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17959\u201317968 (2022)","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2seq: Large-scale pretraining of a visual language model for dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10714\u201310726 (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"issue":"6","key":"14_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3712059","volume":"57","author":"I Qasim","year":"2025","unstructured":"Qasim, I., Horsch, A., Prasad, D.: Dense video captioning: a survey of techniques, datasets and evaluation protocols. ACM Comput. Surv. 57(6), 1\u201336 (2025)","journal-title":"ACM Comput. Surv."},{"issue":"1","key":"14_CR4","doi-asserted-by":"publisher","first-page":"2440640","DOI":"10.1080\/23311975.2024.2440640","volume":"12","author":"P Bijalwan","year":"2025","unstructured":"Bijalwan, P., Gupta, A., Johri, A., Wasiq, M., Khalil Wani, S.: Unveiling Sora open AI\u2019s impact: a review of transformative shifts in marketing and advertising employment. Cogent Bus. Manag. 12(1), 2440640 (2025)","journal-title":"Cogent Bus. 
Manag."},{"issue":"25","key":"14_CR5","doi-asserted-by":"publisher","first-page":"35619","DOI":"10.1007\/s11042-021-11878-w","volume":"81","author":"V Jain","year":"2022","unstructured":"Jain, V., Al-Turjman, F., Chaudhary, G., Nayar, D., Gupta, V., Kumar, A.: Retracted article: video captioning: a review of theory, techniques and practices. Multimedia Tools Appl. 81(25), 35619\u201335653 (2022)","journal-title":"Multimedia Tools Appl."},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Abdar, M., et al.: A review of deep learning for video captioning. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3522295"},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Zhou, X., et al.: Streaming dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18243\u201318252 (2024)","DOI":"10.1109\/CVPR52733.2024.01727"},{"issue":"1","key":"14_CR8","doi-asserted-by":"publisher","first-page":"e12785","DOI":"10.1002\/eng2.12785","volume":"6","author":"MS Wajid","year":"2024","unstructured":"Wajid, M.S., Terashima-Marin, H., Najafirad, P., Wajid, M.A.: Deep learning and knowledge graph for image\/video captioning: a review of datasets, evaluation metrics, and methods. Eng. Rep. 6(1), e12785 (2024)","journal-title":"Eng. Rep."},{"key":"14_CR9","unstructured":"Chen, L., et al.: ShareGPT4Video: improving video understanding and generation with better captions. arXiv preprint arXiv:2406.04325 (2024)"},{"issue":"2","key":"14_CR10","doi-asserted-by":"publisher","first-page":"e12920","DOI":"10.1111\/exsy.12920","volume":"42","author":"S Varma","year":"2025","unstructured":"Varma, S., James, D.P.: Retracted: an efficient deep learning-based video captioning framework using multi-modal features. Expert. Syst. 42(2), e12920 (2025)","journal-title":"Expert. Syst."},{"key":"14_CR11","doi-asserted-by":"publisher","first-page":"111138","DOI":"10.1016\/j.patcog.2024.111138","volume":"159","author":"F Yuan","year":"2025","unstructured":"Yuan, F., Gu, S., Zhang, X., Fang, Z.: Fully exploring object relation interaction and hidden state attention for video captioning. Pattern Recogn. 159, 111138 (2025)","journal-title":"Pattern Recogn."},{"key":"14_CR12","unstructured":"Adilkhanov, A., et al..: Survey on vision-language-action models. arXiv preprint arXiv:2502.06851 (2025)"},{"key":"14_CR13","unstructured":"Yang, Z., et al.: CogVideoX: text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072 (2024)"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Malaviya, P., Patel, D., Bharti, S.: Video captioning using large language models. In: INOCON 2024, pp. 1\u20137. IEEE (2024)","DOI":"10.1109\/INOCON60754.2024.10512233"},{"key":"14_CR15","unstructured":"Yang, D., et al: VRiPT: a video is worth thousands of words. Adv. Neural Inf. Process. Syst. 37, 57240\u201357261 (2025)"},{"key":"14_CR16","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Liu, W.: Reconstruction network for video captioning. In: CVPR, pp. 7622\u20137631 (2018)","DOI":"10.1109\/CVPR.2018.00795"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Iashin, V., Rahtu, E.: Multi-modal dense video captioning. In: CVPR Workshops, pp. 
958\u2013959 (2020)","DOI":"10.1109\/CVPRW50498.2020.00487"},{"issue":"9","key":"14_CR18","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao, L., Guo, Z., Zhang, H., Xu, X., Shen, H.T.: Video captioning with attention-based LSTM and semantic consistency. IEEE Trans. Multimedia 19(9), 2045\u20132055 (2017)","journal-title":"IEEE Trans. Multimedia"},{"issue":"11","key":"14_CR19","doi-asserted-by":"publisher","first-page":"5552","DOI":"10.1109\/TIP.2019.2916757","volume":"28","author":"B Zhao","year":"2019","unstructured":"Zhao, B., Li, X., Lu, X.: CAM-RNN: Co-attention model based RNN for video captioning. IEEE Trans. Image Process. 28(11), 5552\u20135565 (2019)","journal-title":"IEEE Trans. Image Process."},{"issue":"1","key":"14_CR20","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1007\/s11220-022-00400-7","volume":"23","author":"KP Deorukhkar","year":"2022","unstructured":"Deorukhkar, K.P., Ket, S.: Image captioning using hybrid LSTM-RNN with deep features. Sens. Imaging 23(1), 31 (2022)","journal-title":"Sens. Imaging"},{"key":"14_CR21","doi-asserted-by":"publisher","first-page":"109204","DOI":"10.1016\/j.patcog.2022.109204","volume":"136","author":"Y Tu","year":"2023","unstructured":"Tu, Y., Zhou, C., Guo, J., Li, H., Gao, S., Yu, Z.: Relation-aware attention for video captioning via graph learning. Pattern Recogn. 136, 109204 (2023)","journal-title":"Pattern Recogn."},{"key":"14_CR22","doi-asserted-by":"publisher","first-page":"109906","DOI":"10.1016\/j.patcog.2023.109906","volume":"145","author":"X Luo","year":"2024","unstructured":"Luo, X., et al.: Global semantic enhancement network for video captioning. Pattern Recogn. 145, 109906 (2024)","journal-title":"Pattern Recogn."},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Umirzakova, S., Muksimova, S., Mardieva, S., Sultanov, M., Cho, Y.I: MIRA-CAP: memory-integrated retrieval-augmented captioning for state-of-the-art image and video captioning. Sensors 24(24), 8013 (2024)","DOI":"10.3390\/s24248013"},{"key":"14_CR24","doi-asserted-by":"crossref","unstructured":"Islam, M.M., et al.: Video recap: recursive captioning of hour-long videos. In: CVPR, pp. 18198\u201318208 (2024)","DOI":"10.1109\/CVPR52733.2024.01723"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Aravind, R., Ashwin, G., Srinivasan, N.: AI enhanced video sequence description generator. In: ADICS 2024, pp. 1\u20136. IEEE (2024)","DOI":"10.1109\/ADICS58448.2024.10533487"},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Baskar, P., Arigela, P., Meda, S.S., Thushar, M., Singh, D..: Video captioning using LSTM-based encoder-decoder architecture. In: INOCON 2024, pp. 1\u20136. IEEE (2024)","DOI":"10.1109\/INOCON60754.2024.10511580"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Naik, D., CD, J.: Video captioning using sentence vector-enabled convolutional framework with short-connected LSTM. Multimedia Tools Appl. 83(4), 11187\u201311213 (2024)","DOI":"10.1007\/s11042-023-15978-7"},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Muhammed Kunju, A.K., et al.: A transformer-based real-time photo captioning framework for visually impaired people with visual attention. Multimedia Tools Appl. 
1\u201320 (2024)","DOI":"10.1007\/s11042-024-18966-7"},{"key":"14_CR29","doi-asserted-by":"crossref","unstructured":"Muksimova, S., Umirzakova, S., Sultanov, M., Im Cho, Y.: Cross-modal transformer-based streaming dense video captioning with neural ODE temporal localization. Sensors 25(3), 707 (2025)","DOI":"10.3390\/s25030707"},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Yousif, A.J., Al-Jammas, M.H.: Real-time Arabic video captioning using CNN and transformer networks based on parallel implementation. Diyala J. Eng. Sci. 84\u201393 (2024)","DOI":"10.24237\/djes.2024.17108"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Khan, S., Teeti, I., Bradley, A., Elhoseiny, M., Cuzzolin, F.: A hybrid graph network for complex activity detection in video. In: WACV, pp. 6762\u20136772 (2024)","DOI":"10.1109\/WACV57701.2024.00662"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.: Towards automatic learning of procedures from web instructional videos. In: AAAI Conference on Artificial Intelligence, New Orleans, LA, USA (2018)","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"14_CR33","doi-asserted-by":"crossref","unstructured":"Plummer, B., Wang, L., Cervantes, C., et al.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR, Boston, MA, USA (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"14_CR35","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: BLEU: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318. Philadelphia, PA, USA (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"14_CR36","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshop on Evaluation Measures, Ann Arbor, MI, USA (2005)"},{"key":"14_CR37","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575. Boston, MA, USA (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"14_CR38","unstructured":"Lin, C.-Y.: ROUGE: A package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381. Barcelona, Spain (2004)"},{"key":"14_CR39","doi-asserted-by":"crossref","unstructured":"Pan, B., Cai, H., Huang, D.-A., Lee, K.-H., Gaidon, A., Adeli, E., et al.: Spatio-temporal graph for video captioning with knowledge distillation. In: CVPR, pp. 10870\u201310879 (2020)","DOI":"10.1109\/CVPR42600.2020.01088"},{"issue":"5","key":"14_CR40","doi-asserted-by":"publisher","first-page":"456","DOI":"10.1049\/cvi2.12103","volume":"16","author":"X Ping","year":"2022","unstructured":"Ping, X., Zhou, B.: Exploring the spatio-temporal aware graph for video captioning. IET Comput. Vis.Comput. Vis. 16(5), 456\u2013467 (2022)","journal-title":"IET Comput. Vis.Comput. 
Vis."}],"container-title":["Lecture Notes in Computer Science","Computer Information Systems and Industrial Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02406-0_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T20:21:58Z","timestamp":1757449318000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02406-0_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,24]]},"ISBN":["9783032024053","9783032024060"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02406-0_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,24]]},"assertion":[{"value":"24 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CISIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Computer Information Systems and Industrial Management","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Fukuoka","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cisim2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pb.edu.pl\/cisim\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}