{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T23:22:42Z","timestamp":1761952962602,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":28,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533978","type":"print"},{"value":"9789819533985","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3398-5_1","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T23:19:20Z","timestamp":1761952760000},"page":"3-14","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["SRG-Net: Semantic Relation-Guided Network for\u00a0Commonsense Video Captioning"],"prefix":"10.1007","author":[{"given":"Zeyu","family":"Xi","sequence":"first","affiliation":[]},{"given":"Yijie","family":"Li","sequence":"additional","affiliation":[]},{"given":"Haoying","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Haoran","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Lifang","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: Soundnet: learning sound representations from unlabeled video. In: Advances in Neural Information Processing Systems, vol. 29 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"1_CR2","unstructured":"Ayyubi, H.A., Liu, T., Nagrani, A., et\u00a0al.: Video summarization: towards entity-aware captions. arXiv preprint arXiv:2312.02188 (2023)"},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.G.: Motion guided region message passing for video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (ICCV), pp. 1543\u20131552 (2021)","DOI":"10.1109\/ICCV48922.2021.00157"},{"key":"1_CR5","unstructured":"Devlin, J., Chang, M.W., et\u00a0al.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186 (2019)"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Fang, Z., Gokhale, T., et\u00a0al.: Video2commonsense: generating commonsense descriptions to enrich video captioning. arXiv preprint arXiv:2003.05162 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.61"},{"issue":"9","key":"1_CR7","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao, L., Guo, Z., Zhang, H., et al.: Video captioning with attention-based LSTM and semantic consistency. IEEE Trans. Multimedia 19(9), 2045\u20132055 (2017)","journal-title":"IEEE Trans. Multimedia"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et\u00a0al.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1_CR9","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"1_CR10","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. Int. J. Comput. Vision 50, 171\u2013184 (2002)","journal-title":"Int. J. Comput. Vision"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Krishnamoorthy, N., Malkarnenkar, G., Mooney, R., et\u00a0al.: Generating natural-language video descriptions using text-mined knowledge. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a027, pp. 541\u2013547 (2013)","DOI":"10.1609\/aaai.v27i1.8679"},{"key":"1_CR12","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Lin, K., Li, L., Lin, C.C., et\u00a0al.: Swinbert: end-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17949\u201317958 (2022)","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., et\u00a0al.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL), pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Sap, M., Le\u00a0Bras, R., Allaway, E., et\u00a0al.: Atomic: an atlas of machine commonsense for if-then reasoning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 3027\u20133035 (2019)","DOI":"10.1609\/aaai.v33i01.33013027"},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Shao, H., Fang, Z., Yang, Y.: Cavan: commonsense knowledge anchored video captioning. In: 2022 26th International Conference on Pattern Recognition (ICPR), pp. 4095\u20134102. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9956241"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Shen, Y., Gu, X., Xu, K., et\u00a0al.: Accurate and fast compressed video captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15558\u201315567 (2023)","DOI":"10.1109\/ICCV51070.2023.01426"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Tang, M., Wang, Z., et\u00a0al.: Clip4caption: clip for video caption. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 4858\u20134862 (2021)","DOI":"10.1145\/3474085.3479207"},{"key":"1_CR19","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et\u00a0al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., et\u00a0al.: Sequence to sequence-video to text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"1_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.129177","volume":"619","author":"Z Xi","year":"2025","unstructured":"Xi, Z., Shi, G., Li, X., et al.: A simple yet effective knowledge guided method for entity-aware video captioning on a basketball benchmark. Neurocomputing 619, 129177 (2025)","journal-title":"Neurocomputing"},{"key":"1_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2025.126906","volume":"274","author":"Z Xi","year":"2025","unstructured":"Xi, Z., Shi, G., Sun, H., et al.: Eika: explicit & implicit knowledge-augmented network for entity-aware sports video captioning. Expert Syst. Appl. 274, 126906 (2025)","journal-title":"Expert Syst. Appl."},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., et\u00a0al.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Ye, H., Li, G., Qi, Y., et\u00a0al.: Hierarchical modular network for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17939\u201317948 (2022)","DOI":"10.1109\/CVPR52688.2022.01741"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Yu, W., Liang, J., Ji, L., et\u00a0al.: Hybrid reasoning network for video-based commonsense captioning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 5213\u20135221 (2021)","DOI":"10.1145\/3474085.3475638"},{"key":"1_CR27","doi-asserted-by":"publisher","first-page":"5147","DOI":"10.1109\/TMM.2023.3330070","volume":"26","author":"M Yuan","year":"2023","unstructured":"Yuan, M., Jia, G., Bao, B.K.: GPT-based knowledge guiding network for commonsense video captioning. IEEE Trans. Multimedia 26, 5147\u20135158 (2023)","journal-title":"IEEE Trans. Multimedia"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., et\u00a0al.: End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8739\u20138748 (2018)","DOI":"10.1109\/CVPR.2018.00911"}],"container-title":["Lecture Notes in Computer Science","Image and Graphics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3398-5_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T23:19:23Z","timestamp":1761952763000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3398-5_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9789819533978","9789819533985"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3398-5_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIG","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Image and Graphics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Xuzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icig2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icig.csig.org.cn\/2025\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}