{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,15]],"date-time":"2025-08-15T00:59:49Z","timestamp":1755219589389,"version":"3.43.0"},"reference-count":31,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,26]],"date-time":"2025-05-26T00:00:00Z","timestamp":1748217600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,26]],"date-time":"2025-05-26T00:00:00Z","timestamp":1748217600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004721","name":"University of Tokyo","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004721","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,26]]},"DOI":"10.1109\/fg61629.2025.11099276","type":"proceedings-article","created":{"date-parts":[[2025,8,6]],"date-time":"2025-08-06T17:55:00Z","timestamp":1754502900000},"page":"1-10","source":"Crossref","is-referenced-by-count":0,"title":["ActRecognition-GPT: Utilizing Multimodal Large Language Models for Spatiotemporal Action Recognition in Nursery Videos"],"prefix":"10.1109","author":[{"given":"Kenta","family":"Watanabe","sequence":"first","affiliation":[{"name":"The University of Tokyo,Graduate School of Information Science and Technology,Department of Information and Communication Engineering,Tokyo,Japan"}]},{"given":"Shuntaro","family":"Masuda","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Graduate School of Information Science and Technology,Department of Information and Communication Engineering,Tokyo,Japan"}]},{"given":"Ling","family":"Xiao","sequence":"additional","affiliation":[{"name":"Hokkaido University,Graduate School of Information Science,Sapporo,Japan"}]},{"given":"Toshihiko","family":"Yamasaki","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Graduate School of Information Science and Technology,Department of Information and Communication Engineering,Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","volume-title":"arXiv preprint arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref2","first-page":"148","article-title":"An analysis of the labor supply of childcare providers","volume":"202","author":"Asai","year":"2020","journal-title":"The Economic Analysis"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/CVPR.2011.5995667"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/ACCESS.2023.3274542"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1109\/CVPR46437.2021.00610"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/cvpr52733.2024.01599"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/CVPR42600.2020.00028"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref9","article-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal 1 lms in video analysis","author":"Fu","year":"2024","journal-title":"arXiv preprint arXiv:2405.21075"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/CVPR.2018.00633"},{"key":"ref11","article-title":"A video dataset for action detection and understanding in nursery schools","author":"Iida","year":"2020","journal-title":"JSAI"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1109\/TPAMI.2012.59"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/CVPR52733.2024.01263"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1109\/CVPR52733.2024.02095"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1016\/j.imavis.2023.104726"},{"key":"ref16","article-title":"Leveraging yolo-world and gpt-4v lmms for zero-shot person detection and action recognition in drone imagery","author":"Limberg","year":"2024","journal-title":"arXiv preprint arXiv:2404.01571"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/ICCV51070.2023.00267"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1145\/3474085.3475503"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1109\/CVPR52729.2023.00069"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/CVPR.2016.91"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1186\/s40723-018-0043-4"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref23","article-title":"Actionclip: A new paradigm for video action recognition","author":"Wang","year":"2021","journal-title":"arXiv preprint arXiv:2109.08472"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"ref25","article-title":"Open-vocabulary spatiotemporal action detection","author":"Wu","year":"2024","journal-title":"arXiv preprint arXiv:2405.10832"},{"key":"ref26","article-title":"Gpt-4o: Visual perception performance of multimodal large language models in piglet activity understanding","author":"Wu","year":"2024","journal-title":"arXiv preprint arXiv:2406.09781"},{"key":"ref27","article-title":"A multi-task joint framework for real-time person search","author":"Ye","year":"2020","journal-title":"arXiv preprint arXiv:2012.06418"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1609\/aaai.v33i01.33019127"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1007\/978-3-031-91813-1_4"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1109\/CVPR52688.2022.01323"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1109\/CVPR.2018.00054"}],"event":{"name":"2025 IEEE 19th International Conference on Automatic Face and Gesture Recognition (FG)","start":{"date-parts":[[2025,5,26]]},"location":"Tampa\/Clearwater, FL, USA","end":{"date-parts":[[2025,5,30]]}},"container-title":["2025 IEEE 19th International Conference on Automatic Face and Gesture Recognition (FG)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11099084\/11099070\/11099276.pdf?arnumber=11099276","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T05:20:58Z","timestamp":1754544058000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11099276\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,26]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/fg61629.2025.11099276","relation":{},"subject":[],"published":{"date-parts":[[2025,5,26]]}}}