{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T17:38:22Z","timestamp":1761845902235,"version":"3.40.3"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031732287"},{"type":"electronic","value":"9783031732294"}],"license":[{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73229-4_15","type":"book-chapter","created":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T15:03:09Z","timestamp":1729782189000},"page":"254-270","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Context-Aware Action Recognition: Introducing a\u00a0Comprehensive Dataset for\u00a0Behavior Contrast"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2913-3188","authenticated-orcid":false,"given":"Tatsuya","family":"Sasaki","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8813-9458","authenticated-orcid":false,"given":"Yoshiki","family":"Ito","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4941-4920","authenticated-orcid":false,"given":"Satoshi","family":"Kondo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,25]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Barekatain, M., et al.: Okutama-action: an aerial view video dataset for concurrent human action detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 28\u201335 (2017)","DOI":"10.1109\/CVPRW.2017.267"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Ben-Shabat, Y., et\u00a0al.: The IKEA ASM dataset: understanding people assembling furniture through actions, objects and pose. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 847\u2013859 (2021)","DOI":"10.1109\/WACV48630.2021.00089"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Blank, M., Gorelick, L., Shechtman, E., Irani, M., Basri, R.: Actions as space-time shapes. In: The Tenth IEEE International Conference on Computer Vision (ICCV 2005), pp. 1395\u20131402 (2005)","DOI":"10.1109\/ICCV.2005.28"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Chung, J., Wuu, C.H., Yang, H.R., Tai, Y.W., Tang, C.K.: HAA500: human-centric atomic action dataset with curated videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13465\u201313474 (2021)","DOI":"10.1109\/ICCV48922.2021.01321"},{"key":"15_CR5","unstructured":"MMA Contributors: Openmmlab\u2019s next generation video understanding toolbox and benchmark (2020). https:\/\/github.com\/open-mmlab\/mmaction2"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Das, S., et al.: Toyota smarthome: real-world activities of daily living. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 833\u2013842 (2019)","DOI":"10.1109\/ICCV.2019.00092"},{"issue":"12","key":"15_CR7","doi-asserted-by":"publisher","first-page":"9703","DOI":"10.1109\/TPAMI.2021.3127885","volume":"44","author":"S Das","year":"2021","unstructured":"Das, S., Dai, R., Yang, D., Bremond, F.: VPN++: rethinking video-pose embeddings for understanding activities of daily living. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 9703\u20139717 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR8","unstructured":"Esser, P., et al.: Scaling rectified flow transformers for high-resolution image synthesis (2024). https:\/\/arxiv.org\/abs\/2403.03206"},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: Ava: a video dataset of spatio-temporally localized atomic visual actions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6047\u20136056 (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Hussein, N., Gavves, E., Smeulders, A.W.: Timeception for complex action recognition. In: CVPR, pp. 254\u2013263 (2019)","DOI":"10.1109\/CVPR.2019.00034"},{"key":"15_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109505","volume":"140","author":"S Kapoor","year":"2023","unstructured":"Kapoor, S., Sharma, A., Verma, A., Singh, S.: Aeriform in-action: a novel dataset for human action recognition in aerial videos. Pattern Recogn. 140, 109505 (2023)","journal-title":"Pattern Recogn."},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"15_CR14","doi-asserted-by":"publisher","unstructured":"Kay, W., et al.: The kinetics human action video dataset (2017). https:\/\/doi.org\/10.48550\/ARXIV.1705.06950","DOI":"10.48550\/ARXIV.1705.06950"},{"issue":"5","key":"15_CR15","doi-asserted-by":"publisher","first-page":"1366","DOI":"10.1007\/s11263-022-01594-9","volume":"130","author":"Y Kong","year":"2022","unstructured":"Kong, Y., Fu, Y.: Human action recognition and prediction: a survey. Int. J. Comput. Vision 130(5), 1366\u20131401 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"15_CR16","doi-asserted-by":"publisher","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision, pp. 2556\u20132563 (2011). https:\/\/doi.org\/10.1109\/ICCV.2011.6126543","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"15_CR17","unstructured":"Li, K., et al.: Uniformer: unified transformer for efficient spatiotemporal representation learning. arXiv preprint arXiv:2201.04676 (2022)"},{"key":"15_CR18","unstructured":"Li, K., et al.: UniFormerV2: spatiotemporal learning by arming image ViTs with video uniformer. arXiv preprint arXiv:2211.09552 (2022)"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Mvitv2: improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4804\u20134814 (2022)","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., Chen, L., He, R., Wang, Z., Wu, G., Wang, L.: Multisports: a multi-person video dataset of spatio-temporally localized sports actions. In: ICCV, pp. 13536\u201313545 (2021)","DOI":"10.1109\/ICCV48922.2021.01328"},{"key":"15_CR21","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"issue":"10","key":"15_CR22","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2019","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L.Y., Kot, A.C.: NTU RGB+D 120: a large-scale benchmark for 3D human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2684\u20132701 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR23","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"15_CR24","doi-asserted-by":"publisher","unstructured":"Oh, S., et al.: A large-scale benchmark dataset for event recognition in surveillance video. In: CVPR 2011, pp. 3153\u20133160 (2011). https:\/\/doi.org\/10.1109\/CVPR.2011.5995586","DOI":"10.1109\/CVPR.2011.5995586"},{"key":"15_CR25","doi-asserted-by":"publisher","first-page":"2259","DOI":"10.1007\/s10462-020-09904-8","volume":"54","author":"P Pareek","year":"2021","unstructured":"Pareek, P., Thakkar, A.: A survey on video-based human action recognition: recent updates, datasets, challenges, and applications. Artif. Intell. Rev. 54, 2259\u20132322 (2021)","journal-title":"Artif. Intell. Rev."},{"key":"15_CR26","unstructured":"Piergiovanni, A., Ryoo, M.: Avid dataset: anonymized videos from diverse countries. In: Advances in Neural Information Processing Systems, vol. 33, pp. 16711\u201316721 (2020)"},{"key":"15_CR27","doi-asserted-by":"publisher","unstructured":"Rohrbach, M., Amin, S., Andriluka, M., Schiele, B.: A database for fine grained activity detection of cooking activities. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1194\u20131201 (2012). https:\/\/doi.org\/10.1109\/CVPR.2012.6247801","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"15_CR28","doi-asserted-by":"publisher","unstructured":"Schuldt, C., Laptev, I., Caputo, B.: Recognizing human actions: a local SVM approach. In: Proceedings of the 17th International Conference on Pattern Recognition, 2004. ICPR 2004, vol.\u00a03, pp. 32\u201336 (2004). https:\/\/doi.org\/10.1109\/ICPR.2004.1334462","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: CVPR, pp. 21096\u201321106 (2022)","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Serpush, F., Rezaei, M.: Complex human action recognition in live videos using hybrid FR-DL method. CoRR abs\/2007.02811 (2020)","DOI":"10.31219\/osf.io\/fsvz2"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+D: a large scale dataset for 3D human activity analysis. In: CVPR, pp. 1010\u20131019 (2016)","DOI":"10.1109\/CVPR.2016.115"},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., Lin, D.: Finegym: a hierarchical video dataset for fine-grained action understanding. In: CVPR, pp. 2616\u20132625 (2020)","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"15_CR33","doi-asserted-by":"crossref","unstructured":"Shen, Y., Elhamifar, E.: Semi-weakly-supervised learning of complex actions from instructional task videos. In: CVPR 2022, pp. 3344\u20133354 (2022)","DOI":"10.1109\/CVPR52688.2022.00334"},{"key":"15_CR34","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Charades-ego: a large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626 (2018)"},{"key":"15_CR35","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"15_CR36","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. In: Advances in Neural Information Processing Systems, vol. 35, pp. 10078\u201310093 (2022)"},{"key":"15_CR37","unstructured":"Tunstall, L., et al.: Efficient few-shot learning without prompts. arXiv preprint arXiv:2209.11055 (2022)"},{"key":"15_CR38","unstructured":"Wang, M., Xing, J., Liu, Y.: Actionclip: a new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)"},{"issue":"5","key":"15_CR39","doi-asserted-by":"publisher","first-page":"1005","DOI":"10.3390\/s19051005","volume":"19","author":"HB Zhang","year":"2019","unstructured":"Zhang, H.B., et al.: A comprehensive survey of vision-based human action recognition methods. Sensors 19(5), 1005 (2019)","journal-title":"Sensors"},{"key":"15_CR40","unstructured":"Zhu, Y., et al.: A comprehensive study of deep video action recognition. arXiv preprint arXiv:2012.06567 (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73229-4_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T15:07:49Z","timestamp":1729782469000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73229-4_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,25]]},"ISBN":["9783031732287","9783031732294"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73229-4_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,25]]},"assertion":[{"value":"25 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}