{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T18:51:36Z","timestamp":1760986296377,"version":"3.40.3"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031611360"},{"type":"electronic","value":"9783031611377"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-61137-7_6","type":"book-chapter","created":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T07:10:33Z","timestamp":1717053033000},"page":"55-64","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring Text-Driven Approaches for\u00a0Online Action Detection"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8809-8476","authenticated-orcid":false,"given":"Manuel","family":"Benavent-Lledo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1712-7265","authenticated-orcid":false,"given":"David","family":"Mulero-P\u00e9rez","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4890-8217","authenticated-orcid":false,"given":"David","family":"Ortiz-Perez","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7798-3055","authenticated-orcid":false,"given":"Jose","family":"Garcia-Rodriguez","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6817-6326","authenticated-orcid":false,"given":"Sergio","family":"Orts-Escolano","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,5,31]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"An, J., Kang, H., Han, S.H., Yang, M.H., Kim, S.J.: Miniroad: minimal RNN framework for online action detection. In: ICCV, pp. 10341\u201310350, October 2023","DOI":"10.1109\/ICCV51070.2023.00949"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: Vivit: a video vision transformer (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"issue":"2","key":"6_CR3","doi-asserted-by":"publisher","first-page":"363","DOI":"10.1007\/s11063-015-9412-y","volume":"43","author":"J Azorin-Lopez","year":"2015","unstructured":"Azorin-Lopez, J., Saval-Calvo, M., Fuster-Guillo, A., Garcia-Rodriguez, J.: A novel prediction method for early recognition of global human behaviour in image sequences. Neural Process. Lett. 43(2), 363\u2013387 (2015)","journal-title":"Neural Process. Lett."},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Azor\u00edn-L\u00f3pez, J., Saval-Calvo, M., Fuster-Guill\u00f3, A., Garc\u00eda-Rodr\u00edguez, J.: Human behaviour recognition based on trajectory analysis using neural networks. In: IJCNN, pp.\u00a01\u20137 (2013)","DOI":"10.1109\/IJCNN.2013.6706724"},{"key":"6_CR5","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Benavent-Lledo, M., Mulero-P\u00e9rez, D., Ortiz-Perez, D., Rodriguez-Juan, J., Berenguer-Agullo, A., Psarrou, A., Garcia-Rodriguez, J.: A comprehensive study on pain assessment from multimodal sensor data. Sensors 23(24) (2023)","DOI":"10.3390\/s23249675"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Benavent-Lled\u00f3, M., Oprea, S., Castro-Vargas, J.A., Martinez-Gonzalez, P., Garcia-Rodriguez, J.: Interaction estimation in egocentric videos via simultaneous hand-object recognition. In: SOCO, pp. 439\u2013448 (2022)","DOI":"10.1007\/978-3-030-87869-6_42"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Benavent-Lledo, M., Oprea, S., Castro-Vargas, J.A., Mulero-Perez, D., Garcia-Rodriguez, J.: Predicting human-object interactions in egocentric videos. In: IJCNN, pp.\u00a01\u20137 (2022)","DOI":"10.1109\/IJCNN55064.2022.9892910"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., Crandall, D., Bansal, M., Bertasius, G.: Vindlu: a recipe for effective video-and-language pretraining (2023)","DOI":"10.1109\/CVPR52729.2023.01034"},{"key":"6_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1007\/978-3-319-46454-1_17","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R De Geest","year":"2016","unstructured":"De Geest, R., Gavves, E., Ghodrati, A., Li, Z., Snoek, C., Tuytelaars, T.: Online action detection. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 269\u2013284. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_17"},{"key":"6_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale (2021)"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Eun, H., Moon, J., Park, J., Jung, C., Kim, C.: Learning to discriminate information for online action detection. In: CVPR, June 2020","DOI":"10.1109\/CVPR42600.2020.00089"},{"key":"6_CR13","unstructured":"Fl\u00f3rez-Revuelta, F., Garc\u00eda-Chamizo, J.M., Garcia-Rodriguez, J., Hern\u00e1ndez\u00a0S\u00e1ez, A., et\u00a0al.: Representation of 2d objects with a topology preserving network (2002)"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Nevatia, R.: Red: Reinforced encoder-decoder networks for action anticipation (2017)","DOI":"10.5244\/C.31.92"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Gao, M., Zhou, Y., Xu, R., Socher, R., Xiong, C.: Woad: weakly supervised online action detection in untrimmed videos. In: CVPR, pp. 1915\u20131923, June 2021","DOI":"10.1109\/CVPR46437.2021.00195"},{"issue":"7","key":"6_CR16","doi-asserted-by":"publisher","first-page":"4413","DOI":"10.1016\/j.asoc.2011.02.007","volume":"11","author":"J Garc\u00eda-Rodr\u00edguez","year":"2011","unstructured":"Garc\u00eda-Rodr\u00edguez, J., Garc\u00eda-Chamizo, J.M.: Surveillance and human-computer interaction applications of self-growing models. Appl. Soft Comput. 11(7), 4413\u20134431 (2011)","journal-title":"Appl. Soft Comput."},{"key":"6_CR17","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1016\/j.patrec.2017.05.027","volume":"99","author":"F Gomez-Donoso","year":"2017","unstructured":"Gomez-Donoso, F., Orts-Escolano, S., Garcia-Garcia, A., Garcia-Rodriguez, J., Castro-Vargas, J.A., Ovidiu-Oprea, S., Cazorla, M.: A robotic platform for customized and interactive rehabilitation of persons with disabilities. Pattern Recogn. Lett. 99, 105\u2013113 (2017)","journal-title":"Pattern Recogn. Lett."},{"key":"6_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101945","volume":"100","author":"J G\u00f3rriz","year":"2023","unstructured":"G\u00f3rriz, J., \u00c1lvarez Ill\u00e1n, I., \u00c1lvarez Marquina, A., Arco, J., Atzmueller, M., et al.: Computational approaches to explainable artificial intelligence: advances in theory, applications and trends. Inf. Fusion 100, 101945 (2023)","journal-title":"Inf. Fusion"},{"key":"6_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"6_CR20","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: ICML, pp. 448\u2013456 (2015)"},{"key":"6_CR21","unstructured":"Jiang, Y.G., Liu, J., et\u00a0al.: Thumos challenge: Action recognition with a large number of classes (2014)"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding (2022)","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Kim, J., Misu, T., Chen, Y.T., Tawari, A., Canny, J.: Grounding human-to-vehicle advice for self-driving vehicles. In: CVPR, June 2019","DOI":"10.1109\/CVPR.2019.01084"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Li, R., Yan, L., Peng, Y., Qing, L.: Lighter transformer for online action detection, ICIGP 2023, pp. 161\u2013167. Association for Computing Machinery (2023)","DOI":"10.1145\/3582649.3582656"},{"key":"6_CR25","unstructured":"Li, Z., et\u00a0al.: A strong baseline for temporal video-text alignment (2023)"},{"issue":"1","key":"6_CR26","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1016\/j.dcan.2020.05.004","volume":"7","author":"P Ni","year":"2021","unstructured":"Ni, P., Lv, S., Zhu, X., Cao, Q., Zhang, W.: A light-weight on-line action detection with hand trajectories for industrial surveillance. Digital Commun. Networks 7(1), 157\u2013166 (2021)","journal-title":"Digital Commun. Networks"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Papalampidi, P., et al.: A simple recipe for contrastively pre-training video-first encoders beyond 16 frames (2023)","DOI":"10.1109\/CVPR52733.2024.01364"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Kuo, W., Angelova, A.: Rethinking video vits: sparse video tubes for joint image and video learning (2022)","DOI":"10.1109\/CVPR52729.2023.00220"},{"key":"6_CR29","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Ramanishka, V., Chen, Y.T., et\u00a0al.: Toward driving scene understanding: a dataset for learning driver behavior and causal reasoning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00803"},{"issue":"6","key":"6_CR31","doi-asserted-by":"publisher","first-page":"6164","DOI":"10.1109\/JSEN.2022.3148431","volume":"22","author":"L Tong","year":"2022","unstructured":"Tong, L., Ma, H., Lin, Q., He, J., Peng, L.: A novel deep learning bi-gru-i model for real-time human activity recognition using inertial sensors. IEEE Sens. J. 22(6), 6164\u20136174 (2022)","journal-title":"IEEE Sens. J."},{"key":"6_CR32","doi-asserted-by":"publisher","first-page":"138","DOI":"10.1016\/j.neunet.2012.02.014","volume":"32","author":"D Viejo","year":"2012","unstructured":"Viejo, D., Garcia, J., Cazorla, M., Gil, D., Johnsson, M.: Using GNG to improve 3D feature extraction-application to 6DoF egomotion. Neural Netw. 32, 138\u2013146 (2012)","journal-title":"Neural Netw."},{"key":"6_CR33","unstructured":"Wang, M., Xing, J., Liu, Y.: Actionclip: a new paradigm for video action recognition (2021)"},{"key":"6_CR34","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Oadtr: online action detection with transformers. In: ICCV, pp. 7565\u20137575, October 2021","DOI":"10.1109\/ICCV48922.2021.00747"},{"key":"6_CR35","doi-asserted-by":"crossref","unstructured":"Wu, W., Sun, Z., Ouyang, W.: Revisiting classifier: transferring vision-language models for video recognition. In: AAAI Conference, vol.\u00a037, pp. 2847\u20132855 (2023)","DOI":"10.1609\/aaai.v37i3.25386"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"6_CR37","doi-asserted-by":"crossref","unstructured":"Xu, M., Gao, M., Chen, Y.T., Davis, L.S., Crandall, D.J.: Temporal recurrent networks for online action detection. In: ICCV, October 2019","DOI":"10.1109\/ICCV.2019.00563"},{"key":"6_CR38","unstructured":"Xu, M., et al.: Long short-term transformer for online action detection. In: NeurIPS (2021)"},{"key":"6_CR39","doi-asserted-by":"crossref","unstructured":"Yang, L., Han, J., Zhang, D.: Colar: effective and efficient online action detection by consulting exemplars. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00316"},{"key":"6_CR40","unstructured":"Zhao, W.X., et\u00a0al.: A survey of large language models (2023)"},{"key":"6_CR41","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Kr\u00e4henb\u00fchl, P.: Real-time online video detection with temporal smoothing transformers. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19830-4_28"}],"container-title":["Lecture Notes in Computer Science","Bioinspired Systems for Translational Applications: From Robotics to Social Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-61137-7_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T18:10:29Z","timestamp":1732126229000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-61137-7_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031611360","9783031611377"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-61137-7_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"31 May 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"IWINAC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Work-Conference on the Interplay Between Natural and Artificial Computation","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Olh\u00e2o","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Portugal","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 May 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 June 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iwinac2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iwinac.eu\/iwinac.org\/iwinac2024\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}