{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T12:42:49Z","timestamp":1742992969675,"version":"3.40.3"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198380"},{"type":"electronic","value":"9783031198397"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19839-7_5","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T11:40:06Z","timestamp":1666438806000},"page":"70-87","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Actor-Centered Representations for\u00a0Action Localization in\u00a0Streaming Videos"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1062-8929","authenticated-orcid":false,"given":"Sathyanarayanan","family":"Aakur","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7332-4207","authenticated-orcid":false,"given":"Sudeep","family":"Sarkar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"5_CR1","unstructured":"Aakur, S., de Souza, F.D., Sarkar, S.: Going deeper with semantics: exploiting semantic contextualization for interpretation of human activity in videos. In: IEEE Winter Conference on Applications of Computer Vision (WACV). IEEE (2019)"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Aakur, S.N., Sarkar, S.: A perceptual prediction framework for self supervised event segmentation. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00129"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Aakur, S.N., Sarkar, S.: Action localization through continual predictive learning. arXiv preprint arXiv:2003.12185 (2020)","DOI":"10.1007\/978-3-030-58568-6_18"},{"key":"5_CR4","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"5_CR5","unstructured":"Choi, W., Shahid, K., Savarese, S.: What are they doing?: collective activity classification using spatio-temporal relationship among people. In: 2009 IEEE 12th International Conference on Computer Vision Workshops, pp. 1282\u20131289. IEEE (2009)"},{"key":"5_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2019.102886","volume":"192","author":"V Escorcia","year":"2020","unstructured":"Escorcia, V., Dao, C.D., Jain, M., Ghanem, B., Snoek, C.: Guess where? actor-supervision for spatiotemporal action localization. Comput. Vis. Image Underst. 192, 102886 (2020)","journal-title":"Comput. Vis. Image Underst."},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Gan, C., Gong, B., Liu, K., Su, H., Guibas, L.J.: Geometry guided convolutional neural networks for self-supervised video representation learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5589\u20135597 (2018)","DOI":"10.1109\/CVPR.2018.00586"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Sanford, R., Javan, M., Snoek, C.G.: Actor-transformers for group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 839\u2013848 (2020)","DOI":"10.1109\/CVPR42600.2020.00092"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Malik, J.: Finding action tubes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 759\u2013768 (2015)","DOI":"10.1109\/CVPR.2015.7298676"},{"issue":"8","key":"5_CR10","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"issue":"3","key":"5_CR11","doi-asserted-by":"publisher","first-page":"743","DOI":"10.3758\/s13423-014-0723-1","volume":"22","author":"G Horstmann","year":"2015","unstructured":"Horstmann, G., Herwig, A.: Surprise attracts the eyes and binds the gaze. Psychon. Bull. Rev. 22(3), 743\u2013749 (2015)","journal-title":"Psychon. Bull. Rev."},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Hou, R., Chen, C., Shah, M.: Tube convolutional neural network (t-CNN) for action detection in videos. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 5822\u20135831 (2017)","DOI":"10.1109\/ICCV.2017.620"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Ibrahim, M.S., Muralidharan, S., Deng, Z., Vahdat, A., Mori, G.: A hierarchical deep temporal model for group activity recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1971\u20131980 (2016)","DOI":"10.1109\/CVPR.2016.217"},{"issue":"3","key":"5_CR14","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1007\/s11263-017-1023-9","volume":"124","author":"M Jain","year":"2017","unstructured":"Jain, M., Van Gemert, J., J\u00e9gou, H., Bouthemy, P., Snoek, C.G.: Tubelets: unsupervised action proposals from spatiotemporal super-voxels. Int. J. Comput. Vision 124(3), 287\u2013311 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Jhuang, H., Gall, J., Zuffi, S., Schmid, C., Black, M.J.: Towards understanding action recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3192\u20133199 (2013)","DOI":"10.1109\/ICCV.2013.396"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Ji, X., Henriques, J.F., Vedaldi, A.: Invariant information clustering for unsupervised image classification and segmentation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 9865\u20139874 (2019)","DOI":"10.1109\/ICCV.2019.00996"},{"key":"5_CR17","unstructured":"Jiang, Y.G., et al.: Thumos challenge: action recognition with a large number of classes (2014)"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Arslan, A., Serre, T.: The language of actions: recovering the syntax and semantics of goal-directed human activities. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 780\u2013787 (2014)","DOI":"10.1109\/CVPR.2014.105"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Lan, T., Wang, Y., Mori, G.: Discriminative figure-centric models for joint action localization and recognition. In: 2011 International Conference on Computer Vision, pp. 2003\u20132010. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126472"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Li, S., et al.: Groupformer: group activity recognition with clustered spatial-temporal transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13668\u201313677 (2021)","DOI":"10.1109\/ICCV48922.2021.01341"},{"key":"5_CR21","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1016\/j.cviu.2017.10.011","volume":"166","author":"Z Li","year":"2018","unstructured":"Li, Z., Gavrilyuk, K., Gavves, E., Jain, M., Snoek, C.G.: Videolstm convolves, attends and flows for action recognition. Comput. Vis. Image Underst. 166, 41\u201350 (2018)","journal-title":"Comput. Vis. Image Underst."},{"key":"5_CR22","unstructured":"Lin, M., Chen, Q., Yan, S.: Network in network. arXiv preprint arXiv:1312.4400 (2013)"},{"key":"5_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-46448-0_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"W Liu","year":"2016","unstructured":"Liu, W., et al.: SSD: single shot multibox detector. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 21\u201337. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Y., Tu, Z., Lin, L., Xie, X., Qin, Q.: Real-time spatio-temporal action localization via learning motion representation. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69756-3_13"},{"key":"5_CR25","doi-asserted-by":"crossref","unstructured":"Luong, M.T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation. arXiv preprint arXiv:1508.04025 (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Pan, J., Chen, S., Shou, M.Z., Liu, Y., Shao, J., Li, H.: Actor-context-actor relation network for spatio-temporal action localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 464\u2013474 (2021)","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Pramono, R.R.A., Chen, Y.T., Fang, W.H.: Hierarchical self-attention network for action localization in videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 61\u201370 (2019)","DOI":"10.1109\/ICCV.2019.00015"},{"key":"5_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-01249-6_7","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Qi","year":"2018","unstructured":"Qi, M., Qin, J., Li, A., Wang, Y., Luo, J., Van Gool, L.: stagNet: an attentive semantic RNN for group activity recognition. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11214, pp. 104\u2013120. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01249-6_7"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Redmon, J., Farhadi, A.: Yolo9000: better, faster, stronger. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7263\u20137271 (2017)","DOI":"10.1109\/CVPR.2017.690"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Rodriguez, M.D., Ahmed, J., Shah, M.: Action MACH a spatio-temporal maximum average correlation height filter for action recognition. In: 2008 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20138. IEEE (2008)","DOI":"10.1109\/CVPR.2008.4587727"},{"issue":"3","key":"5_CR31","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vis. (IJCV) 115(3), 211\u2013252 (2015). https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"Int. J. Comput. Vis. (IJCV)"},{"key":"5_CR32","unstructured":"Sharma, S., Kiros, R., Salakhutdinov, R.: Action recognition using visual attention. In: Neural Information Processing Systems: Time Series Workshop (2015)"},{"key":"5_CR33","doi-asserted-by":"crossref","unstructured":"Shu, T., Todorovic, S., Zhu, S.C.: CERN: confidence-energy recurrent network for group activity recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5523\u20135531 (2017)","DOI":"10.1109\/CVPR.2017.453"},{"key":"5_CR34","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Soomro, K., Idrees, H., Shah, M.: Action localization in videos through context walk. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3280\u20133288 (2015)","DOI":"10.1109\/ICCV.2015.375"},{"key":"5_CR36","doi-asserted-by":"crossref","unstructured":"Soomro, K., Idrees, H., Shah, M.: Predicting the where and what of actors and actions through online action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2648\u20132657 (2016)","DOI":"10.1109\/CVPR.2016.290"},{"key":"5_CR37","doi-asserted-by":"crossref","unstructured":"Soomro, K., Shah, M.: Unsupervised action discovery and localization in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 696\u2013705 (2017)","DOI":"10.1109\/ICCV.2017.82"},{"key":"5_CR38","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"5_CR39","doi-asserted-by":"crossref","unstructured":"Tian, Y., Sukthankar, R., Shah, M.: Spatiotemporal deformable part models for action detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2642\u20132649 (2013)","DOI":"10.1109\/CVPR.2013.341"},{"key":"5_CR40","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"5_CR41","unstructured":"Tran, D., Yuan, J.: Max-margin structured output regression for spatio-temporal action localization. In: Advances in neural information processing systems, pp. 350\u2013358 (2012)"},{"key":"5_CR42","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiao, J., Bao, L., He, S., Liu, Y., Liu, W.: Self-supervised spatio-temporal representation learning for videos by predicting motion and appearance statistics. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4006\u20134015 (2019)","DOI":"10.1109\/CVPR.2019.00413"},{"key":"5_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"565","DOI":"10.1007\/978-3-319-10602-1_37","volume-title":"Computer Vision \u2013 ECCV 2014","author":"L Wang","year":"2014","unstructured":"Wang, L., Qiao, Yu., Tang, X.: Video action detection with relational dynamic-poselets. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 565\u2013580. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_37"},{"key":"5_CR44","doi-asserted-by":"crossref","unstructured":"Wang, M., Ni, B., Yang, X.: Recurrent modeling of interaction context for collective activity recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3048\u20133056 (2017)","DOI":"10.1109\/CVPR.2017.783"},{"key":"5_CR45","doi-asserted-by":"crossref","unstructured":"Weinzaepfel, P., Harchaoui, Z., Schmid, C.: Learning to track for spatio-temporal action localization. In: Proceedings of the IEEE international conference on computer vision, pp. 3164\u20133172 (2015)","DOI":"10.1109\/ICCV.2015.362"},{"key":"5_CR46","doi-asserted-by":"crossref","unstructured":"Wu, J., Wang, L., Wang, L., Guo, J., Wu, G.: Learning actor relation graphs for group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9964\u20139974 (2019)","DOI":"10.1109\/CVPR.2019.01020"},{"key":"5_CR47","unstructured":"Xie, J., Girshick, R., Farhadi, A.: Unsupervised deep embedding for clustering analysis. In: International Conference on Machine Learning (ICML), pp. 478\u2013487 (2016)"},{"key":"5_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107312","volume":"103","author":"D Zhang","year":"2020","unstructured":"Zhang, D., He, L., Tu, Z., Zhang, S., Han, F., Yang, B.: Learning motion representation for real-time spatio-temporal action localization. Pattern Recogn. 103, 107312 (2020)","journal-title":"Pattern Recogn."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19839-7_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T10:09:59Z","timestamp":1728209399000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19839-7_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198380","9783031198397"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19839-7_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}