{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:43:22Z","timestamp":1776883402215,"version":"3.51.2"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031781094","type":"print"},{"value":"9783031781100","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78110-0_15","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:53:28Z","timestamp":1733090008000},"page":"229-244","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Zero-Shot Spatio-Temporal Action Detection by\u00a0Enhancing Context-Relation Capability of\u00a0Vision-Language Models"],"prefix":"10.1007","author":[{"given":"Yasunori","family":"Babazaki","sequence":"first","affiliation":[]},{"given":"Takashi","family":"Shibata","sequence":"additional","affiliation":[]},{"given":"Toru","family":"Takahashi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: Watch only once: an end-to-end video action detection framework. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00807"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: CVPR, pp. 2818\u20132829 (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Faure, G.J., Chen, M.H., Lai, S.H.: Holistic interaction transformer network for action detection. In: WACV, pp. 3340\u20133350 (2023)","DOI":"10.1109\/WACV56688.2023.00334"},{"key":"15_CR4","unstructured":"Ge, Z., Liu, S., Wang, F., Li, Z., Sun, J.: Yolox: exceeding yolo series in 2021. ArXiv arxiv:2107.08430 (2021)"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Huang, W., Yeh, J.H., Faure, G.J., Chen, M.H., Lai, S.H.: Interaction-aware prompting for zero-shot spatio-temporal action detection. In: ICCVW, pp. 284\u2013293 (2023)","DOI":"10.1109\/ICCVW60793.2023.00036"},{"key":"15_CR6","unstructured":"Huang, X., Zhou, H., Yao, K., Han, K.: Froster: frozen clip is a strong teacher for open-vocabulary action recognition. ArXiv arxiv:2402.03241 (2024)"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Jhuang, H., Gall, J., Zuffi, S., Schmid, C., Black, M.J.: Towards understanding action recognition. In: ICCV, pp. 3192\u20133199 (2013)","DOI":"10.1109\/ICCV.2013.396"},{"key":"15_CR8","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. ArXiv arxiv:2102.05918 (2021)"},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Li, L.H., et al.: Grounded language-image pre-training. In: CVPR, pp. 10955\u201310965 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R.B., He, K., Hariharan, B., Belongie, S.J.: Feature pyramid networks for object detection. In: CVPR, pp. 936\u2013944 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"15_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"15_CR12","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS, vol.\u00a036 (2024)"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Liu, S., et al.: Grounding dino: marrying dino with grounded pre-training for open-set object detection. ArXiv arxiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Ni, B., et al.: Expanding language-image pretrained models for general video recognition. In: ECCV 2022 (2022)","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"Pan, J., Chen, S., Shou, M.Z., Liu, Y., Shao, J., Li, H.: Actor-context-actor relation network for spatio-temporal action localization. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"15_CR16","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"15_CR17","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R.B., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. IEEE TPAMI 39, 1137\u20131149 (2015)","journal-title":"IEEE TPAMI"},{"key":"15_CR18","unstructured":"Soomro, K., Zamir, A., Shah, M.: Ucf101: a dataset of 101 human actions classes from videos in the wild. ArXiv arxiv:1212.0402 (2012)"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"15_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1007\/978-3-030-58555-6_5","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Tang","year":"2020","unstructured":"Tang, J., Xia, J., Mu, X., Pang, B., Lu, C.: Asynchronous interaction aggregation for action detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 71\u201387. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_5"},{"key":"15_CR21","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, vol.\u00a030 (2017)"},{"key":"15_CR22","doi-asserted-by":"publisher","first-page":"3349","DOI":"10.1109\/TPAMI.2020.2983686","volume":"43","author":"J Wang","year":"2020","unstructured":"Wang, J., et al.: Deep high-resolution representation learning for visual recognition. IEEE TPAMI 43, 3349\u20133364 (2020)","journal-title":"IEEE TPAMI"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Wang, M., Xing, J., Mei, J., Liu, Y., Jiang, Y.: Actionclip: adapting language-image pretrained models for video action recognition. IEEE TNNLS (2023)","DOI":"10.1109\/TNNLS.2023.3331841"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Wu, T., Cao, M., Gao, Z., Wu, G., Wang, L.: Stmixer: a one-stage sparse action detector. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01414"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R.B., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. In: CVPR, pp. 5987\u20135995 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"15_CR26","unstructured":"Xue, H., et al.: Clip-vip: adapting pre-trained image-text model to video-language alignment. In: ICLR (2022)"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Bytetrack: multi-object tracking by associating every detection box. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Zhao, J., et\u00a0al.: Tuber: tubelet transformer for video action detection. In: CVPR, pp. 13598\u201313607 (2022)","DOI":"10.1109\/CVPR52688.2022.01323"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78110-0_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:33:17Z","timestamp":1733095997000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78110-0_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031781094","9783031781100"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78110-0_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}