{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T16:35:38Z","timestamp":1764174938637,"version":"3.40.3"},"publisher-location":"Cham","reference-count":69,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030585259"},{"type":"electronic","value":"9783030585266"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58526-6_5","type":"book-chapter","created":{"date-parts":[[2020,10,6]],"date-time":"2020-10-06T21:03:07Z","timestamp":1602018187000},"page":"71-90","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":19,"title":["Representation Learning on Visual-Symbolic Graphs for Video Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7552-8342","authenticated-orcid":false,"given":"Effrosyni","family":"Mavroudi","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9705-4483","authenticated-orcid":false,"given":"Benjam\u00edn B\u00e9jar","family":"Haro","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1838-0761","authenticated-orcid":false,"given":"Ren\u00e9","family":"Vidal","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,10,7]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Assari, S.M., Zamir, A.R., Shah, M.: Video classification using semantic concept co-occurrences. In: IEEE Conference on Computer Vision and Pattern Recognition (2014)","DOI":"10.1109\/CVPR.2014.324"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Bajaj, M., Wang, L., Sigal, L.: G3raphground: Graph-based language grounding. In: IEEE International Conference on Computer Vision, pp. 4281\u20134290 (2019)","DOI":"10.1109\/ICCV.2019.00438"},{"key":"5_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-030-01261-8_7","volume-title":"Computer Vision \u2013 ECCV 2018","author":"F Baradel","year":"2018","unstructured":"Baradel, F., Neverova, N., Wolf, C., Mille, J., Mori, G.: Object level visual reasoning in videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 106\u2013122. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_7"},{"key":"5_CR4","doi-asserted-by":"publisher","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 4724\u20134733 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"key":"5_CR5","doi-asserted-by":"publisher","unstructured":"Chen, X., Li, L., Fei-Fei, L., Gupta, A.: Iterative visual reasoning beyond convolutions. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 7239\u20137248 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00756","DOI":"10.1109\/CVPR.2018.00756"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., Rohrbach, M., Yan, Z., Shuicheng, Y., Feng, J., Kalantidis, Y.: Graph-based global reasoning networks. In: IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00052"},{"key":"5_CR7","doi-asserted-by":"publisher","unstructured":"Ch\u00e9ron, G., Laptev, I., Schmid, C.: P-CNN: pose-based CNN features for action recognition. In: IEEE International Conference on Computer Vision, pp. 3218\u20133226 (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.368","DOI":"10.1109\/ICCV.2015.368"},{"key":"5_CR8","doi-asserted-by":"publisher","unstructured":"Choi, M.J., Lim, J.J., Torralba, A., Willsky, A.S.: Exploiting hierarchical context on a large database of object categories. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 129\u2013136 (2010). https:\/\/doi.org\/10.1109\/CVPR.2010.5540221","DOI":"10.1109\/CVPR.2010.5540221"},{"key":"5_CR9","doi-asserted-by":"publisher","unstructured":"Dave, A., Russakovsky, O., Ramanan, D.: Predictive-corrective networks for action detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2067\u20132076 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.223","DOI":"10.1109\/CVPR.2017.223"},{"key":"5_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1007\/978-3-319-10590-1_4","volume-title":"Computer Vision \u2013 ECCV 2014","author":"J Deng","year":"2014","unstructured":"Deng, J., et al.: Large-scale object classification using label relation graphs. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 48\u201364. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_4"},{"key":"5_CR11","doi-asserted-by":"publisher","unstructured":"Deng, Z., Vahdat, A., Hu, H., Mori, G.: Structure inference machines: recurrent neural networks for analyzing relations in group activity recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 4772\u20134781 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.516","DOI":"10.1109\/CVPR.2016.516"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Ghosh, P., Yao, Y., Davis, L., Divakaran, A.: Stacked spatio-temporal graph convolutional networks for action segmentation. In: IEEE Winter Applications of Computer Vision Conference (2020)","DOI":"10.1109\/WACV45572.2020.9093361"},{"key":"5_CR13","unstructured":"Gilmer, J., Schoenholz, S.S., Riley, P.F., Vinyals, O., Dahl, G.E.: Neural message passing for quantum chemistry. In: International Conference on Machine Learning, pp. 1263\u20131272 (2017)"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., Zisserman, A.: Video action transformer network. In: IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00033"},{"key":"5_CR15","doi-asserted-by":"publisher","unstructured":"Gkioxari, G., Girshick, R., Malik, J.: Contextual action recognition with R*CNN. In: IEEE International Conference on Computer Vision, pp. 1080\u20131088 (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.129","DOI":"10.1109\/ICCV.2015.129"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Gong, L., Cheng, Q.: Exploiting edge features for graph neural networks. In: IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00943"},{"key":"5_CR17","doi-asserted-by":"publisher","unstructured":"He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask R-CNN. IEEE Trans. Pattern Anal. Mach. Intell., 1 (2018). https:\/\/doi.org\/10.1109\/TPAMI.2018.2844175","DOI":"10.1109\/TPAMI.2018.2844175"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Delving deep into rectifiers: surpassing human-level performance on ImageNet classification. In: IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.123"},{"key":"5_CR19","unstructured":"Huang, H., Zhou, L., Zhang, W., Xu, C.: Dynamic graph modules for modeling higher-order interactions in activity recognition. In: British Machine Vision Conference (2019)"},{"key":"5_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"742","DOI":"10.1007\/978-3-030-01219-9_44","volume-title":"Computer Vision \u2013 ECCV 2018","author":"MS Ibrahim","year":"2018","unstructured":"Ibrahim, M.S., Mori, G.: Hierarchical relational networks for group activity recognition and retrieval. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11207, pp. 742\u2013758. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01219-9_44"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Jain, A., Zamir, A.R., Savarese, S., Saxena, A.: Structural-RNN: deep learning on spatio-temporal graphs. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 5308\u20135317 (2016)","DOI":"10.1109\/CVPR.2016.573"},{"key":"5_CR22","unstructured":"Jiang, C., Xu, H., Liang, X., Lin, L.: Hybrid knowledge routed modules for large-scale object detection. In: Neural Information Processing Systems, pp. 1552\u20131563 (2018)"},{"issue":"2","key":"5_CR23","doi-asserted-by":"publisher","first-page":"352","DOI":"10.1109\/TPAMI.2017.2670560","volume":"40","author":"YG Jiang","year":"2018","unstructured":"Jiang, Y.G., Wu, Z., Wang, J., Xue, X., Chang, S.F.: Exploiting feature and class relationships in video categorization with regularized deep neural networks. IEEE Trans. Pattern Anal. Mach. Intell. 40(2), 352\u2013364 (2018). https:\/\/doi.org\/10.1109\/TPAMI.2017.2670560","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5_CR24","doi-asserted-by":"publisher","unstructured":"Junior, N.I.N., Hu, H., Zhou, G., Deng, Z., Liao, Z., Mori, G.: Structured label inference for visual understanding. IEEE Trans. Pattern Anal. Mach. Intell., 1 (2019). https:\/\/doi.org\/10.1109\/TPAMI.2019.2893215","DOI":"10.1109\/TPAMI.2019.2893215"},{"key":"5_CR25","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. In: International Conference on Learning Representations (2017)"},{"key":"5_CR26","unstructured":"Koller, D., et al.: Towards robust automatic traffic scene analysis in real-time. In: IEEE Conference on Computer Vision and Pattern Recognition (1994)"},{"issue":"8","key":"5_CR27","doi-asserted-by":"publisher","first-page":"951","DOI":"10.1177\/0278364913478446","volume":"32","author":"HS Koppula","year":"2013","unstructured":"Koppula, H.S., Gupta, R., Saxena, A.: Learning human activities and object affordances from RGB-D videos. Int. J. Rob. Res. 32(8), 951\u2013970 (2013). https:\/\/doi.org\/10.1177\/0278364913478446","journal-title":"Int. J. Rob. Res."},{"key":"5_CR28","doi-asserted-by":"publisher","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","DOI":"10.1007\/s11263-016-0981-7"},{"key":"5_CR29","doi-asserted-by":"publisher","unstructured":"Lea, C., Flynn, M.D., Vidal, R., Reiter, A., Hager, G.D.: Temporal convolutional networks for action segmentation and detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1003\u20131012 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.113","DOI":"10.1109\/CVPR.2017.113"},{"key":"5_CR30","doi-asserted-by":"publisher","unstructured":"Lee, C., Fang, W., Yeh, C., Wang, Y.F.: Multi-label zero-shot learning with structured knowledge graphs. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1576\u20131585 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00170","DOI":"10.1109\/CVPR.2018.00170"},{"key":"5_CR31","doi-asserted-by":"crossref","unstructured":"Li, R., Tapaswi, M., Liao, R., Jia, J., Urtasun, R., Fidler, S.: Situation recognition with graph neural networks. In: IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.448"},{"key":"5_CR32","unstructured":"Li, Y., Gupta, A.: Beyond grids: learning graph representations for visual recognition. In: Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Neural Information Processing Systems, pp. 9225\u20139235 (2018)"},{"key":"5_CR33","unstructured":"Liang, X., Hu, Z., Zhang, H., Lin, L., Xing, E.P.: Symbolic graph reasoning meets convolutions. In: Neural Information Processing Systems, pp. 1853\u20131863. Curran Associates, Inc. (2018)"},{"key":"5_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"5_CR35","doi-asserted-by":"publisher","unstructured":"Liu, J., Kuipers, B., Savarese, S.: Recognizing human actions by attributes. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3337\u20133344 (2011). https:\/\/doi.org\/10.1109\/CVPR.2011.5995353","DOI":"10.1109\/CVPR.2011.5995353"},{"key":"5_CR36","doi-asserted-by":"publisher","unstructured":"Ma, C., Kadav, A., Melvin, I., Kira, Z., AlRegib, G., Graf, H.P.: Attend and interact: higher-order object interactions for video understanding. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 6790\u20136800 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00710","DOI":"10.1109\/CVPR.2018.00710"},{"key":"5_CR37","doi-asserted-by":"publisher","unstructured":"Marszalek, M., Schmid, C.: Semantic hierarchies for visual object recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20137 (2007). https:\/\/doi.org\/10.1109\/CVPR.2007.383272","DOI":"10.1109\/CVPR.2007.383272"},{"key":"5_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"479","DOI":"10.1007\/978-3-540-88693-8_35","volume-title":"Computer Vision \u2013 ECCV 2008","author":"M Marsza\u0142ek","year":"2008","unstructured":"Marsza\u0142ek, M., Schmid, C.: Constructing category hierarchies for visual recognition. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008. LNCS, vol. 5305, pp. 479\u2013491. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-88693-8_35"},{"key":"5_CR39","doi-asserted-by":"publisher","unstructured":"Mavroudi, E., Tao, L., Vidal, R.: Deep moving poselets for video based action recognition. In: IEEE Winter Applications of Computer Vision Conference, pp. 111\u2013120 (2017). https:\/\/doi.org\/10.1109\/WACV.2017.20","DOI":"10.1109\/WACV.2017.20"},{"key":"5_CR40","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Neural Information Processing Systems, pp. 3111\u20133119 (2013)"},{"key":"5_CR41","unstructured":"Nicolicioiu, A., Duta, I., Leordeanu, M.: Recurrent space-time graph neural networks. In: Neural Information Processing Systems (2019)"},{"issue":"12","key":"5_CR42","doi-asserted-by":"publisher","first-page":"520","DOI":"10.1016\/j.tics.2007.09.009","volume":"11","author":"A Oliva","year":"2007","unstructured":"Oliva, A., Torralba, A.: The role of context in object recognition. Trends Cogn. Sci. 11(12), 520\u2013527 (2007). https:\/\/doi.org\/10.1016\/j.tics.2007.09.009","journal-title":"Trends Cogn. Sci."},{"key":"5_CR43","doi-asserted-by":"publisher","unstructured":"Piergiovanni, A., Ryoo, M.S.: Learning latent super-events to detect multiple activities in videos. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 5304\u20135313 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00556","DOI":"10.1109\/CVPR.2018.00556"},{"key":"5_CR44","unstructured":"Piergiovanni, A.J., Ryoo, M.S.: Temporal gaussian mixture layer for videos. In: International Conference on Machine learning (2019)"},{"key":"5_CR45","doi-asserted-by":"crossref","unstructured":"Prest, A., Ferrari, V., Schmid, C.: Explicit modeling of human-object interactions in realistic videos. IEEE Trans. Pattern Anal. Mach. Intell. (2013)","DOI":"10.1109\/TPAMI.2012.175"},{"key":"5_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1007\/978-3-030-01240-3_25","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Qi","year":"2018","unstructured":"Qi, S., Wang, W., Jia, B., Shen, J., Zhu, S.-C.: Learning human-object interactions by graph parsing neural networks. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11213, pp. 407\u2013423. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_25"},{"key":"5_CR47","doi-asserted-by":"publisher","unstructured":"Ramanathan, V., et al.: Learning semantic relationships for better action retrieval in images. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1100\u20131109 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298713","DOI":"10.1109\/CVPR.2015.7298713"},{"key":"5_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"593","DOI":"10.1007\/978-3-319-93417-4_38","volume-title":"The Semantic Web","author":"M Schlichtkrull","year":"2018","unstructured":"Schlichtkrull, M., Kipf, T.N., Bloem, P., van\u00a0den Berg, R., Titov, I., Welling, M.: Modeling relational data with graph convolutional networks. In: Gangemi, A., et al. (eds.) ESWC 2018. LNCS, vol. 10843, pp. 593\u2013607. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-93417-4_38"},{"key":"5_CR49","doi-asserted-by":"publisher","unstructured":"Sigurdsson, G.A., Divvala, S., Farhadi, A., Gupta, A.: Asynchronous temporal fields for action recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 5650\u20135659 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.599","DOI":"10.1109\/CVPR.2017.599"},{"key":"5_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"5_CR51","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Ghahramani, Z., Welling, M., Cortes, C., Lawrence, N.D., Weinberger, K.Q. (eds.) Neural Information Processing Systems, pp. 568\u2013576. Curran Associates, Inc. (2014)"},{"key":"5_CR52","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/978-3-030-01252-6_20","volume-title":"Computer Vision \u2013 ECCV 2018","author":"C Sun","year":"2018","unstructured":"Sun, C., et al.: Actor-centric relation network. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11215, pp. 335\u2013351. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01252-6_20"},{"key":"5_CR53","doi-asserted-by":"crossref","unstructured":"Teney, D., Liu, L., van den Hengel, A.: Graph-structured representations for visual question answering. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20139 (2017)","DOI":"10.1109\/CVPR.2017.344"},{"key":"5_CR54","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"5_CR55","unstructured":"Veli\u010dkovi\u0107, P., Cucurull, G., Casanova, A., Romero, A., Li\u00f2, P., Bengio, Y.: Graph attention networks. In: International Conference on Learning Representations (2018)"},{"key":"5_CR56","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"5_CR57","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"413","DOI":"10.1007\/978-3-030-01228-1_25","volume-title":"Computer Vision \u2013 ECCV 2018","author":"X Wang","year":"2018","unstructured":"Wang, X., Gupta, A.: Videos as space-time region graphs. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 413\u2013431. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_25"},{"key":"5_CR58","doi-asserted-by":"crossref","unstructured":"Wang, X., Ji, Q.: Video event recognition with deep hierarchical context model. In: IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7299071"},{"key":"5_CR59","doi-asserted-by":"crossref","unstructured":"Xiong, Y., Huang, Q., Guo, L., Zhou, H., Zhou, B., Lin, D.: A graph-based framework to bridge movies and synopses. In: IEEE International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00469"},{"key":"5_CR60","doi-asserted-by":"publisher","unstructured":"Xu, H., Das, A., Saenko, K.: R-C3d: region convolutional 3D network for temporal activity detection. In: IEEE International Conference on Computer Vision, pp. 5794\u20135803 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.617","DOI":"10.1109\/ICCV.2017.617"},{"key":"5_CR61","doi-asserted-by":"crossref","unstructured":"Yatskar, M., Zettlemoyer, L., Farhadi, A.: Situation recognition: visual semantic role labeling for image understanding. In: IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.597"},{"key":"5_CR62","doi-asserted-by":"publisher","unstructured":"Yuan, Y., Liang, X., Wang, X., Yeung, D., Gupta, A.: Temporal dynamic graph LSTM for action-driven video object detection. In: IEEE International Conference on Computer Vision, pp. 1819\u20131828 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.200","DOI":"10.1109\/ICCV.2017.200"},{"key":"5_CR63","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tokmakov, P., Hebert, M., Schmid, C.: A structured model for action detection. In: IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.01021"},{"key":"5_CR64","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"831","DOI":"10.1007\/978-3-030-01246-5_49","volume-title":"Computer Vision \u2013 ECCV 2018","author":"B Zhou","year":"2018","unstructured":"Zhou, B., Andonian, A., Oliva, A., Torralba, A.: Temporal relational reasoning in videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 831\u2013846. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_49"},{"key":"5_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, L., Kalantidis, Y., Chen, X., Corso, J.J., Rohrbach, M.: Grounded video description. In: IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00674"},{"key":"5_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., Socher, R., Xiong, C.: End-to-end dense video captioning with masked transformer. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 8739\u20138748 (2018)","DOI":"10.1109\/CVPR.2018.00911"},{"key":"5_CR67","doi-asserted-by":"publisher","unstructured":"Zhou, Y., Ni, B., Tian, Q.: Interaction part mining: a mid-level approach for fine-grained action recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3323\u20133331 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298953","DOI":"10.1109\/CVPR.2015.7298953"},{"key":"5_CR68","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Nayak, N.M., Roy-Chowdhury, A.K.: Context-aware modeling and recognition of activities in video. In: IEEE Conference on Computer Vision and Pattern Recognition (2013)","DOI":"10.1109\/CVPR.2013.322"},{"key":"5_CR69","doi-asserted-by":"crossref","unstructured":"Zitnik, M., Agrawal, M., Leskovec, J.: Modeling polypharmacy side effects with graph convolutional networks. Bioinformatics, 457\u2013466 (2018)","DOI":"10.1093\/bioinformatics\/bty294"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58526-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T00:05:41Z","timestamp":1728173141000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58526-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585259","9783030585266"],"references-count":69,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58526-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"7 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic. From the ECCV Workshops 249 full papers, 18 short papers, and 21 further contributions were published out of a total of 467 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}