{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T18:10:04Z","timestamp":1748196604159,"version":"3.41.0"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031919886","type":"print"},{"value":"9783031919893","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91989-3_18","type":"book-chapter","created":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T17:33:20Z","timestamp":1748194400000},"page":"281-294","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Real-Time Online Egocentric Action Recognition on\u00a0Smart Eyewear"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6344-408X","authenticated-orcid":false,"given":"Riccardo","family":"Santambrogio","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Federico","family":"Caspani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Greta","family":"Corti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1862-4853","authenticated-orcid":false,"given":"Francesca","family":"Palermo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7059-413X","authenticated-orcid":false,"given":"Simone","family":"Mentasti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8935-5593","authenticated-orcid":false,"given":"Diana","family":"Trojaniello","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8306-6739","authenticated-orcid":false,"given":"Matteo","family":"Matteucci","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"An, J., Kang, H., Han, S.H., Yang, M.H., Kim, S.J.: Miniroad: minimal rnn framework for online action detection. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00949"},{"key":"18_CR2","unstructured":"An, J., Park, Y., Kang, H., Kim, S.J.: Object aware egocentric online action detection (2024). https:\/\/arxiv.org\/abs\/2406.01079"},{"key":"18_CR3","doi-asserted-by":"crossref","unstructured":"Ashutosh, K., Girdhar, R., Torresani, L., Grauman, K.: Hiervl: learning hierarchical video-language embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23066\u201323078 (June 2023)","DOI":"10.1109\/CVPR52729.2023.02209"},{"key":"18_CR4","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, 18\u201324 Jul, vol.\u00a0139, pp. 813\u2013824. PMLR (2021), https:\/\/proceedings.mlr.press\/v139\/bertasius21a.html"},{"key":"18_CR5","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using rnn encoder-decoder for statistical machine translation (2014). https:\/\/arxiv.org\/abs\/1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"key":"18_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/978-3-030-01225-0_44","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Damen","year":"2018","unstructured":"Damen, D., et al.: Scaling egocentric vision: the dataset. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 753\u2013771. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_44"},{"key":"18_CR7","doi-asserted-by":"publisher","unstructured":"Damen, D., et al.: Rescaling egocentric vision: Collection, pipeline and challenges for epic-kitchens-100. Inter. J. Compute. Vis. (IJCV) 130, 33-55 (2022). https:\/\/doi.org\/10.1007\/s11263-021-01531-2","DOI":"10.1007\/s11263-021-01531-2"},{"key":"18_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1007\/978-3-319-46454-1_17","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R De Geest","year":"2016","unstructured":"De Geest, R., Gavves, E., Ghodrati, A., Li, Z., Snoek, C., Tuytelaars, T.: Online action detection. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 269\u2013284. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_17"},{"key":"18_CR9","doi-asserted-by":"publisher","unstructured":"Eun, H., Moon, J., Park, J., Jung, C., Kim, C.: Learning to discriminate information for online action detection. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 806\u2013815 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00089","DOI":"10.1109\/CVPR42600.2020.00089"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Nevatia, R.: Red: reinforced encoder-decoder networks for action anticipation (2017). https:\/\/arxiv.org\/abs\/1707.04818","DOI":"10.5244\/C.31.92"},{"key":"18_CR11","doi-asserted-by":"crossref","unstructured":"Geest, R.D., Tuytelaars, T.: Modeling temporal structure with lstm for online action detection. 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 1549\u20131557 (2018), https:\/\/api.semanticscholar.org\/CorpusID:21788692","DOI":"10.1109\/WACV.2018.00173"},{"key":"18_CR12","unstructured":"Grauman, K., et al.: Ego4d: around the world in 3,000 hours of egocentric video. In: IEEE\/CVF Computer Vision and Pattern Recognition (CVPR) (2022)"},{"key":"18_CR13","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"18_CR14","doi-asserted-by":"publisher","unstructured":"Hu, X., Dai, J., Li, M., Peng, C., Li, Y., Du, S.: Online human action detection and anticipation in videos: a survey. Neurocomputing 491, 395\u2013413 (2022). https:\/\/doi.org\/10.1016\/j.neucom.2022.03.069, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0925231222003617","DOI":"10.1016\/j.neucom.2022.03.069"},{"key":"18_CR15","doi-asserted-by":"publisher","unstructured":"Idrees, H., et al.: The thumos challenge on action recognition for videos \u201cin the wild\u201d. Comput. Vis. Image Understanding 155, 1\u201323 (2017). https:\/\/doi.org\/10.1016\/j.cviu.2016.10.018, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1077314216301710","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"18_CR16","doi-asserted-by":"publisher","unstructured":"Kim, Y.H., Nam, S., Kim, S.J.: Temporally smooth online action detection using cycle-consistent future anticipation. Pattern Recogn. 116, 107954 (2021). https:\/\/doi.org\/10.1016\/j.patcog.2021.107954, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0031320321001412","DOI":"10.1016\/j.patcog.2021.107954"},{"key":"18_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., Nagarajan, T., Xiong, B., Grauman, K.: Ego-exo: transferring visual representations from third-person to first-person videos. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00687"},{"key":"18_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1007\/978-3-030-01228-1_38","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Li","year":"2018","unstructured":"Li, Y., Liu, M., Rehg, J.M.: In the eye of beholder: joint learning of gaze and actions in first person video. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 639\u2013655. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_38"},{"key":"18_CR19","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. In: Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) Advances in Neural Information Processing Systems (2022). https:\/\/openreview.net\/forum?id=nE8_DvxAqAB"},{"issue":"2","key":"18_CR20","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1109\/TPAMI.2018.2858826","volume":"42","author":"TY Lin","year":"2020","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. IEEE Trans. Pattern Anal. Mach. Intell. 42(2), 318\u2013327 (2020). https:\/\/doi.org\/10.1109\/TPAMI.2018.2858826","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"18_CR21","doi-asserted-by":"publisher","unstructured":"N\u00fa\u00f1ez-Marcos, A., Azkune, G., Arganda-Carreras, I.: Egocentric vision-based action recognition: A survey. Neurocomputing 472, 175-197 (2022). https:\/\/doi.org\/10.1016\/j.neucom.2021.11.081","DOI":"10.1016\/j.neucom.2021.11.081"},{"key":"18_CR22","unstructured":"van\u00a0den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding (2019). https:\/\/arxiv.org\/abs\/1807.03748"},{"key":"18_CR23","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: Egovlpv2: egocentric video-language pre-training with fusion in the backbone. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 5285\u20135297 (October 2023)","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"18_CR24","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021). https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"18_CR25","doi-asserted-by":"crossref","unstructured":"Ragusa, F., Furnari, A., Livatino, S., Farinella, G.M.: The meccano dataset: understanding human-object interactions from egocentric videos in an industrial-like domain. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 1569\u20131578 (January 2021)","DOI":"10.1109\/WACV48630.2021.00161"},{"key":"18_CR26","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Charades-ego: a large-scale dataset of paired third and first person videos (2018). https:\/\/arxiv.org\/abs\/1804.09626"},{"key":"18_CR27","doi-asserted-by":"publisher","unstructured":"Truong, T.D., Luu, K.: Cross-view action recognition understanding from exocentric to egocentric perspective (arXiv:2305.15699) (May 2023). https:\/\/doi.org\/10.48550\/arXiv.2305.15699","DOI":"10.48550\/arXiv.2305.15699"},{"key":"18_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"18_CR29","unstructured":"Wang, S., Li, Z., Zhao, Y., Xiong, Y., Wang, L., Lin, D.: denseflow (2020). https:\/\/github.com\/open-mmlab\/denseflow"},{"key":"18_CR30","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Oadtr: online action detection with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7565\u20137575 (October 2021)","DOI":"10.1109\/ICCV48922.2021.00747"},{"key":"18_CR31","doi-asserted-by":"publisher","unstructured":"Xu, L., et al.: Towards continual egocentric activity recognition: A multi-modal egocentric activity dataset for continual learning. IEEE Trans. Multimedia 26, 2430\u20132443 (2024). https:\/\/doi.org\/10.1109\/TMM.2023.3295899","DOI":"10.1109\/TMM.2023.3295899"},{"key":"18_CR32","doi-asserted-by":"crossref","unstructured":"Xu, M., Gao, M., Chen, Y.T., Davis, L.S., Crandall, D.J.: Temporal recurrent networks for online action detection. In: IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00563"},{"key":"18_CR33","unstructured":"Xu, M., et al.: Long short-term transformer for online action detection. In: Ranzato, M., Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems, vol.\u00a034, pp. 1086\u20131099. Curran Associates, Inc. (2021), https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/08b255a5d42b89b0585260b6f2360bdd-Paper.pdf"},{"key":"18_CR34","unstructured":"Xue, Z., Grauman, K.: Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment. In: NeurIPS (2023)"},{"key":"18_CR35","doi-asserted-by":"publisher","unstructured":"Yang, L., Huang, Y., Sugano, Y., Sato, Y.: Interact before align: leveraging cross-modal knowledge for domain adaptive action recognition. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14702\u201314712 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01431","DOI":"10.1109\/CVPR52688.2022.01431"},{"key":"18_CR36","doi-asserted-by":"publisher","unstructured":"Yonetani, R., Kitani, K.M., Sato, Y.: Ego-surfing first person videos. In: Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2015), pp. 5445\u20135454 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7299183","DOI":"10.1109\/CVPR.2015.7299183"},{"key":"18_CR37","doi-asserted-by":"publisher","unstructured":"Zhao, Y., Kr\u00e4henb\u00fchl, P.: Real-time online video detection with temporal smoothing transformers. In: Computer Vision - ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXIV. p. 485-502. Springer-Verlag, Berlin, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19830-4_28","DOI":"10.1007\/978-3-031-19830-4_28"},{"key":"18_CR38","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., Girdhar, R.: Learning video representations from large language models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"18_CR39","doi-asserted-by":"publisher","unstructured":"Zhu, X., Dai, J., Yuan, L., Wei, Y.: Towards high performance video object detection. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7210\u20137218 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00753","DOI":"10.1109\/CVPR.2018.00753"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91989-3_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T17:33:28Z","timestamp":1748194408000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91989-3_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031919886","9783031919893"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91989-3_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}