{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T08:56:40Z","timestamp":1776070600329,"version":"3.50.1"},"publisher-location":"Cham","reference-count":67,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585471","type":"print"},{"value":"9783030585488","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58548-8_41","type":"book-chapter","created":{"date-parts":[[2020,10,28]],"date-time":"2020-10-28T23:02:42Z","timestamp":1603926162000},"page":"709-727","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":127,"title":["MovieNet: A Holistic Dataset for Movie Understanding"],"prefix":"10.1007","author":[{"given":"Qingqiu","family":"Huang","sequence":"first","affiliation":[]},{"given":"Yu","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Anyi","family":"Rao","sequence":"additional","affiliation":[]},{"given":"Jiaze","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,10,29]]},"reference":[{"key":"41_CR1","unstructured":"Arandjelovic, O., Zisserman, A.: Automatic face recognition for film character retrieval in feature-length films. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE (2005)"},{"key":"41_CR2","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: A deep siamese network for scene detection in broadcast videos. In: 23rd ACM International Conference on Multimedia, pp. 1199\u20131202. ACM (2015)","DOI":"10.1145\/2733373.2806316"},{"key":"41_CR3","doi-asserted-by":"crossref","unstructured":"Bauml, M., Tapaswi, M., Stiefelhagen, R.: Semi-supervised learning with constraints for person identification in multimedia data. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2013)","DOI":"10.1109\/CVPR.2013.462"},{"issue":"3","key":"41_CR4","doi-asserted-by":"publisher","first-page":"686","DOI":"10.1109\/TMM.2014.2300833","volume":"16","author":"S Bhattacharya","year":"2014","unstructured":"Bhattacharya, S., Mehran, R., Sukthankar, R., Shah, M.: Classification of cinematographic shots using lie algebra and its application to complex event recognition. IEEE Trans. Multimed. 16(3), 686\u2013696 (2014)","journal-title":"IEEE Trans. Multimed."},{"key":"41_CR5","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Bach, F., Laptev, I., Ponce, J., Schmid, C., Sivic, J.: Finding actors and actions in movies. In: Proceedings of the IEEE International Conference on Computer Vision (2013)","DOI":"10.1109\/ICCV.2013.283"},{"key":"41_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"628","DOI":"10.1007\/978-3-319-10602-1_41","volume-title":"Computer Vision \u2013 ECCV 2014","author":"P Bojanowski","year":"2014","unstructured":"Bojanowski, P., et al.: Weakly supervised action labeling in videos under ordering constraints. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 628\u2013643. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_41"},{"key":"41_CR7","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"41_CR8","doi-asserted-by":"crossref","unstructured":"Cai, Z., Vasconcelos, N.: Cascade R-CNN: delving into high quality object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6154\u20136162 (2018)","DOI":"10.1109\/CVPR.2018.00644"},{"key":"41_CR9","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"41_CR10","unstructured":"Cascante-Bonilla, P., Sitaraman, K., Luo, M., Ordonez, V.: Moviescope: Large-scale analysis of movies using multiple modalities. arXiv preprint arXiv:1908.03180 (2019)"},{"key":"41_CR11","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1109\/TMM.2008.2008924","volume":"11","author":"VT Chasanis","year":"2008","unstructured":"Chasanis, V.T., Likas, A.C., Galatsanos, N.P.: Scene detection in videos using shot clustering and sequence alignment. IEEE Trans. Multimed. 11, 89\u2013100 (2008)","journal-title":"IEEE Trans. Multimed."},{"key":"41_CR12","doi-asserted-by":"crossref","unstructured":"Chen, K., et al.: Hybrid task cascade for instance segmentation. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2019","DOI":"10.1109\/CVPR.2019.00511"},{"key":"41_CR13","unstructured":"Chen, K., et al.: MMDetection: open MMLab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155 (2019)"},{"key":"41_CR14","doi-asserted-by":"publisher","first-page":"427","DOI":"10.1007\/s00530-013-0306-4","volume":"19","author":"M Del Fabro","year":"2013","unstructured":"Del Fabro, M., B\u00f6sz\u00f6rmenyi, L.: State-of-the-art and future challenges in video scene detection: a survey. Multimed. Syst. 19, 427\u2013454 (2013). https:\/\/doi.org\/10.1007\/s00530-013-0306-4","journal-title":"Multimed. Syst."},{"key":"41_CR15","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"41_CR16","doi-asserted-by":"crossref","unstructured":"Deng, Z., et al.: R3net: recurrent residual refinement network for saliency detection. In: Proceedings of the 27th International Joint Conference on Artificial Intelligence, pp. 684\u2013690. AAAI Press (2018)","DOI":"10.24963\/ijcai.2018\/95"},{"issue":"4","key":"41_CR17","doi-asserted-by":"publisher","first-page":"743","DOI":"10.1109\/TPAMI.2011.155","volume":"34","author":"P Dollar","year":"2011","unstructured":"Dollar, P., Wojek, C., Schiele, B., Perona, P.: Pedestrian detection: an evaluation of the state of the art. IEEE Trans. Pattern Anal. Mach. Intell. 34(4), 743\u2013761 (2011)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"41_CR18","doi-asserted-by":"crossref","unstructured":"Duchenne, O., Laptev, I., Sivic, J., Bach, F.R., Ponce, J.: Automatic annotation of human actions in video. In: Proceedings of the IEEE International Conference on Computer Vision (2009)","DOI":"10.1109\/ICCV.2009.5459279"},{"key":"41_CR19","doi-asserted-by":"crossref","unstructured":"Everingham, M., Sivic, J., Zisserman, A.: Hello my name is... buffy - automatic naming of characters in TV video. In: BMVC (2006)","DOI":"10.5244\/C.20.92"},{"key":"41_CR20","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"41_CR21","unstructured":"Frome, A., et al.: Devise: a deep visual-semantic embedding model. In: Advances in Neural Information Processing Systems, pp. 2121\u20132129 (2013)"},{"key":"41_CR22","unstructured":"Giannetti, L.D., Leach, J.: Understanding Movies, vol.\u00a01. Prentice Hall Upper Saddle River, New Jersey (1999)"},{"issue":"5\u20136","key":"41_CR23","doi-asserted-by":"publisher","first-page":"602","DOI":"10.1016\/j.neunet.2005.06.042","volume":"18","author":"A Graves","year":"2005","unstructured":"Graves, A., Schmidhuber, J.: Framewise phoneme classification with bidirectional LSTM and other neural network architectures. Neural Networks 18(5\u20136), 602\u2013610 (2005)","journal-title":"Neural Networks"},{"key":"41_CR24","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: Ava: a video dataset of spatio-temporally localized atomic visual actions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6047\u20136056 (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"41_CR25","unstructured":"Han, B., Wu, W.: Video scene segmentation using a novel boundary evaluation criterion and dynamic programming. In: IEEE International Conference on Multimedia and Expo. IEEE (2011)"},{"key":"41_CR26","doi-asserted-by":"crossref","unstructured":"Haurilet, M.L., Tapaswi, M., Al-Halah, Z., Stiefelhagen, R.: Naming TV characters by watching and analyzing dialogs. In: 2016 IEEE Winter Conference on Applications of Computer Vision (WACV). IEEE (2016)","DOI":"10.1109\/WACV.2016.7477560"},{"key":"41_CR27","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"41_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"437","DOI":"10.1007\/978-3-030-01261-8_26","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Q Huang","year":"2018","unstructured":"Huang, Q., Liu, W., Lin, D.: Person search in videos with one portrait through visual and temporal links. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 437\u2013454. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_26"},{"key":"41_CR29","doi-asserted-by":"crossref","unstructured":"Huang, Q., Xiong, Y., Lin, D.: Unifying identification and context learning for person recognition. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2018","DOI":"10.1109\/CVPR.2018.00236"},{"key":"41_CR30","unstructured":"Huang, Q., Xiong, Y., Xiong, Y., Zhang, Y., Lin, D.: From trailers to storylines: an efficient way to learn from movies. arXiv preprint arXiv:1806.05341 (2018)"},{"key":"41_CR31","doi-asserted-by":"crossref","unstructured":"Huang, Q., Yang, L., Huang, H., Wu, T., Lin, D.: Caption-supervised face recognition: Training a state-of-the-art face model without manual annotation. In: Proceedings of the European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58520-4_9"},{"key":"41_CR32","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marsza\u0142ek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE Computer Society (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"41_CR33","doi-asserted-by":"crossref","unstructured":"Li, W., Zhao, R., Xiao, T., Wang, X.: Deepreid: deep filter pairing neural network for person re-identification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 152\u2013159 (2014)","DOI":"10.1109\/CVPR.2014.27"},{"key":"41_CR34","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"41_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"41_CR36","unstructured":"Loy, C.C., et al.: Wider face and pedestrian challenge 2018: Methods and results. arXiv preprint arXiv:1902.06854 (2019)"},{"key":"41_CR37","doi-asserted-by":"crossref","unstructured":"Marsza\u0142ek, M., Laptev, I., Schmid, C.: Actions in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE Computer Society (2009)","DOI":"10.1109\/CVPRW.2009.5206557"},{"key":"41_CR38","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)"},{"key":"41_CR39","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Zisserman, A.: From benedict cumberbatch to sherlock holmes: character identification in TV series without a script. BMVC (2017)","DOI":"10.5244\/C.31.107"},{"key":"41_CR40","doi-asserted-by":"crossref","unstructured":"Park, S.B., Kim, H.N., Kim, H., Jo, G.S.: Exploiting script-subtitles alignment to scene boundary detection in movie. In: IEEE International Symposium on Multimedia. IEEE (2010)","DOI":"10.1109\/ISM.2010.17"},{"key":"41_CR41","doi-asserted-by":"crossref","unstructured":"Rao, A., et al.: A unified framework for shot type classification based on subject centric lens. In: Proceedings of the European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58621-8_2"},{"key":"41_CR42","doi-asserted-by":"crossref","unstructured":"Rao, A., et al.: A local-to-global approach to multi-modal movie scene segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10146\u201310155 (2020)","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"41_CR43","doi-asserted-by":"publisher","first-page":"1097","DOI":"10.1109\/TMM.2005.858392","volume":"7","author":"Z Rasheed","year":"2005","unstructured":"Rasheed, Z., Shah, M.: Detection and representation of scenes in videos. IEEE Trans. Multimed. 7, 1097\u20131105 (2005)","journal-title":"IEEE Trans. Multimed."},{"key":"41_CR44","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, pp. 91\u201399 (2015)"},{"key":"41_CR45","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"41_CR46","doi-asserted-by":"crossref","unstructured":"Rotman, D., Porat, D., Ashour, G.: Optimal sequential grouping for robust video scene detection using multiple modalities. Int. J. Semant. Comput. 11(02), 193\u2013208 (2017)","DOI":"10.1142\/S1793351X17400086"},{"key":"41_CR47","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-01240-3_13","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Shao","year":"2018","unstructured":"Shao, D., Xiong, Y., Zhao, Y., Huang, Q., Qiao, Y., Lin, D.: Find and focus: retrieve and localize video events with natural language queries. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11213, pp. 202\u2013218. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_13"},{"issue":"8","key":"41_CR48","doi-asserted-by":"publisher","first-page":"1163","DOI":"10.1109\/TCSVT.2011.2138830","volume":"21","author":"P Sidiropoulos","year":"2011","unstructured":"Sidiropoulos, P., Mezaris, V., Kompatsiaris, I., Meinedo, H., Bugalho, M., Trancoso, I.: Temporal video segmentation to scenes using high-level audiovisual features. IEEE Trans. Circuits Syst. Video Technol. 21(8), 1163\u20131177 (2011)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"41_CR49","doi-asserted-by":"crossref","unstructured":"Sim\u00f5es, G.S., Wehrmann, J., Barros, R.C., Ruiz, D.D.: Movie genre classification with convolutional neural networks. In: 2016 International Joint Conference on Neural Networks (IJCNN). IEEE (2016)","DOI":"10.1109\/IJCNN.2016.7727207"},{"key":"41_CR50","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"41_CR51","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., B\u00e4uml, M., Stiefelhagen, R.: knock! knock! who is it? probabilistic person identification in TV-series. In: IEEE Conference on Computer Vision and Pattern Recognition. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247986"},{"key":"41_CR52","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., B\u00e4uml, M., Stiefelhagen, R.: Story-based video retrieval in TV series using plot synopses. In: Proceedings of International Conference on Multimedia Retrieval, p. 137. ACM (2014)","DOI":"10.1145\/2578726.2578727"},{"key":"41_CR53","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., B\u00e4uml, M., Stiefelhagen, R.: Aligning plot synopses to videos for story-based retrieval. Int. J. Multimed. Inf. Retrieval 4(1), 3\u201316 (2015)","DOI":"10.1007\/s13735-014-0065-9"},{"key":"41_CR54","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: MovieQA: understanding stories in movies through question-answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"41_CR55","doi-asserted-by":"crossref","unstructured":"Vicol, P., Tapaswi, M., Castrejon, L., Fidler, S.: MovieGraphs: towards understanding human-centric situations from videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00895"},{"issue":"10","key":"41_CR56","doi-asserted-by":"publisher","first-page":"1529","DOI":"10.1109\/TCSVT.2009.2022705","volume":"19","author":"HL Wang","year":"2009","unstructured":"Wang, H.L., Cheong, L.F.: Taxonomy of directing semantics for film shot classification. IEEE Trans. Circuits Syst. Video Technol. 19(10), 1529\u20131542 (2009)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"41_CR57","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"41_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"174","DOI":"10.1007\/978-3-030-58610-2_11","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Xia","year":"2020","unstructured":"Xia, J., Rao, A., Huang, Q., Xu, L., Wen, J., Lin, D.: Online multi-modal person search in videos. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12357, pp. 174\u2013190. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_11"},{"key":"41_CR59","doi-asserted-by":"crossref","unstructured":"Xiong, Y., Huang, Q., Guo, L., Zhou, H., Zhou, B., Lin, D.: A graph-based framework to bridge movies and synopses. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4592\u20134601 (2019)","DOI":"10.1109\/ICCV.2019.00469"},{"key":"41_CR60","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"41_CR61","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: Using context saliency for movie shot classification. In: 2011 18th IEEE International Conference on Image Processing, pp. 3653\u20133656. IEEE (2011)","DOI":"10.1109\/ICIP.2011.6116510"},{"key":"41_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-540-76390-1_11","volume-title":"Computer Vision \u2013 ACCV 2007","author":"Y Yang","year":"2007","unstructured":"Yang, Y., Lin, S., Zhang, Y., Tang, S.: Statistical framework for shot segmentation and classification in sports video. In: Yagi, Y., Kang, S.B., Kweon, I.S., Zha, H. (eds.) ACCV 2007. LNCS, vol. 4844, pp. 106\u2013115. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-76390-1_11"},{"key":"41_CR63","unstructured":"Zhao, Y., Xiong, Y., Lin, D.: Mmaction (2019). https:\/\/github.com\/open-mmlab\/mmaction"},{"key":"41_CR64","doi-asserted-by":"crossref","unstructured":"Zheng, L., Shen, L., Tian, L., Wang, S., Wang, J., Tian, Q.: Scalable person re-identification: a benchmark. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1116\u20131124 (2015)","DOI":"10.1109\/ICCV.2015.133"},{"key":"41_CR65","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"831","DOI":"10.1007\/978-3-030-01246-5_49","volume-title":"Computer Vision \u2013 ECCV 2018","author":"B Zhou","year":"2018","unstructured":"Zhou, B., Andonian, A., Oliva, A., Torralba, A.: Temporal relational reasoning in videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 831\u2013846. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_49"},{"key":"41_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, H., Hermans, T., Karandikar, A.V., Rehg, J.M.: Movie genre classification via scene categorization. In: Proceedings of the 18th ACM International Conference on Multimedia, pp. 747\u2013750. ACM (2010)","DOI":"10.1145\/1873951.1874068"},{"key":"41_CR67","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 19\u201327 (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58548-8_41","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:17:02Z","timestamp":1730161022000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58548-8_41"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585471","9783030585488"],"references-count":67,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58548-8_41","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}