{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T19:12:31Z","timestamp":1743102751854,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031204968"},{"type":"electronic","value":"9783031204975"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20497-5_40","type":"book-chapter","created":{"date-parts":[[2022,12,16]],"date-time":"2022-12-16T12:09:06Z","timestamp":1671192546000},"page":"490-501","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Spatial-Temporal Contextual Feature Fusion Network for\u00a0Movie Description"],"prefix":"10.1007","author":[{"given":"Yihui","family":"Liao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lu","family":"Fan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huiming","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhifeng","family":"Xie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,12,17]]},"reference":[{"key":"40_CR1","unstructured":"Banerjee, S., Lavie, A.: Meteor: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"40_CR2","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"issue":"5","key":"40_CR3","first-page":"1112","volume":"42","author":"L Gao","year":"2019","unstructured":"Gao, L., Li, X., Song, J., Shen, H.T.: Hierarchical LSTMs with adaptive attention for visual captioning. IEEE Trans. Pattern Anal. Mach. Intell. 42(5), 1112\u20131131 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"40_CR4","doi-asserted-by":"crossref","unstructured":"Han, S.H., Go, B.W., Choi, H.J.: Multiple videos captioning model for video storytelling. In: 2019 IEEE International Conference on Big Data and Smart Computing (BigComp), pp. 1\u20134. IEEE (2019)","DOI":"10.1109\/BIGCOMP.2019.8679213"},{"key":"40_CR5","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"40_CR6","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"40_CR7","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"40_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1007\/978-3-030-01216-8_12","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Dhruv Mahajan","year":"2018","unstructured":"Mahajan, Dhruv, Girshick, Ross, Ramanathan, Vignesh, He, Kaiming, Paluri, Manohar, Li, Yixuan, Bharambe, Ashwin, van der Maaten, Laurens: Exploring the limits of weakly supervised pretraining. In: Ferrari, Vittorio, Hebert, Martial, Sminchisescu, Cristian, Weiss, Yair (eds.) ECCV 2018. LNCS, vol. 11206, pp. 185\u2013201. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01216-8_12"},{"key":"40_CR9","doi-asserted-by":"crossref","unstructured":"Mi, L., Chen, Z.: Hierarchical graph attention network for visual relationship detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13886\u201313895 (2020)","DOI":"10.1109\/CVPR42600.2020.01390"},{"key":"40_CR10","doi-asserted-by":"crossref","unstructured":"Pan, P., Xu, Z., Yang, Y., Wu, F., Zhuang, Y.: Hierarchical recurrent neural encoder for video representation with application to captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1029\u20131038 (2016)","DOI":"10.1109\/CVPR.2016.117"},{"issue":"8","key":"40_CR11","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"40_CR12","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"issue":"1","key":"40_CR13","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/s11263-016-0987-1","volume":"123","author":"A Rohrbach","year":"2017","unstructured":"Rohrbach, A., et al.: Movie description. Int. J. Comput. Vision 123(1), 94\u2013120 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"40_CR14","doi-asserted-by":"publisher","unstructured":"Ronfard, R., Thuong, T.: A framework for aligning and indexing movies with their script. In: 2003 Proceedings of International Conference on Multimedia and Expo. ICME 2003 (Cat. No. 03TH8698), vol. 1, pp. 1\u201321 (2003). https:\/\/doi.org\/10.1109\/ICME.2003.1220844","DOI":"10.1109\/ICME.2003.1220844"},{"issue":"1","key":"40_CR15","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1109\/TNN.2008.2005605","volume":"20","author":"F Scarselli","year":"2008","unstructured":"Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE Trans. Neural Netw. 20(1), 61\u201380 (2008)","journal-title":"IEEE Trans. Neural Netw."},{"key":"40_CR16","unstructured":"Shetty, R., Laaksonen, J.: Video captioning with recurrent networks based on frame-and video-level features and visual content classification. arXiv preprint arXiv:1512.02949 (2015)"},{"key":"40_CR17","doi-asserted-by":"crossref","unstructured":"Song, L., Smola, A., Gretton, A., Borgwardt, K.M., Bedo, J.: Supervised feature selection via dependence estimation. In: Proceedings of the 24th International Conference on Machine Learning, pp. 823\u2013830 (2007)","DOI":"10.1145\/1273496.1273600"},{"key":"40_CR18","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Bauml, M., Stiefelhagen, R.: Book2movie: aligning video scenes with book chapters. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1827\u20131835 (2015)","DOI":"10.1109\/CVPR.2015.7298792"},{"key":"40_CR19","unstructured":"Torabi, A., Pal, C., Larochelle, H., Courville, A.: Using descriptive video services to create a large data source for video annotation research. arXiv preprint arXiv:1503.01070 (2015)"},{"key":"40_CR20","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"40_CR21","unstructured":"Veli\u010dkovi\u0107, P., Cucurull, G., Casanova, A., Romero, A., Lio, P., Bengio, Y.: Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)"},{"key":"40_CR22","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"40_CR23","doi-asserted-by":"publisher","first-page":"327","DOI":"10.1016\/j.patrec.2018.07.024","volume":"130","author":"H Wang","year":"2020","unstructured":"Wang, H., Gao, C., Han, Y.: Sequence in sequence for video captioning. Pattern Recogn. Lett. 130, 327\u2013334 (2020)","journal-title":"Pattern Recogn. Lett."},{"key":"40_CR24","doi-asserted-by":"crossref","unstructured":"Wang, J., Bao, B., Xu, C.: Dualvgr: A dual-visual graph reasoning unit for video question answering. IEEE Trans. Multimed. (2021)","DOI":"10.1109\/TMM.2021.3097171"},{"key":"40_CR25","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1492\u20131500 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"40_CR26","doi-asserted-by":"publisher","unstructured":"Xie, Z., Zhang, W., Sheng, B., Li, P., Chen, C.L.P.: BagFN: broad attentive graph fusion network for high-order feature interactions. IEEE Trans. Neural Netw. Learn. Syst. 1\u201315 (2021). https:\/\/doi.org\/10.1109\/TNNLS.2021.3116209","DOI":"10.1109\/TNNLS.2021.3116209"},{"key":"40_CR27","doi-asserted-by":"crossref","unstructured":"Yu, Y., Chung, J., Yun, H., Kim, J., Kim, G.: Transitional adaptation of pretrained models for visual storytelling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12658\u201312668 (2021)","DOI":"10.1109\/CVPR46437.2021.01247"},{"key":"40_CR28","doi-asserted-by":"crossref","unstructured":"Yu, Y., Ko, H., Choi, J., Kim, G.: End-to-end concept word detection for video captioning, retrieval, and question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3165\u20133173 (2017)","DOI":"10.1109\/CVPR.2017.347"},{"key":"40_CR29","doi-asserted-by":"publisher","first-page":"663","DOI":"10.1109\/LSP.2021.3066349","volume":"28","author":"R Zhong","year":"2021","unstructured":"Zhong, R., Wang, R., Zou, Y., Hong, Z., Hu, M.: Graph attention networks adjusted Bi-LSTM for video summarization. IEEE Signal Process. Lett. 28, 663\u2013667 (2021)","journal-title":"IEEE Signal Process. Lett."},{"key":"40_CR30","doi-asserted-by":"crossref","unstructured":"Zhou, W., Xia, Z., Dou, P., Su, T., Hu, H.: Double attention based on graph attention network for image multi-label classification. ACM Trans. Multimed. Comput. Commun. App. (TOMM) (2022)","DOI":"10.1145\/3519030"}],"container-title":["Lecture Notes in Computer Science","Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20497-5_40","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,16]],"date-time":"2022-12-16T12:23:13Z","timestamp":1671193393000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20497-5_40"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031204968","9783031204975"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20497-5_40","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"17 December 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CICAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CAAI International Conference on Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Beijing","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 August 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cicai2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/cicai.caai.cn\/#\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"472","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"164","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"35% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.1","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}