{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T12:01:03Z","timestamp":1743076863304,"version":"3.40.3"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031783463"},{"type":"electronic","value":"9783031783470"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78347-0_17","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T14:53:43Z","timestamp":1733064823000},"page":"250-265","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Act-ChatGPT: Introducing Action Features into Multi-modal Large Language Models for\u00a0Video Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8926-3183","authenticated-orcid":false,"given":"Yuto","family":"Nakamizo","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0431-183X","authenticated-orcid":false,"given":"Keiji","family":"Yanai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"17_CR1","unstructured":"Alec, R., Jong, Wook, K., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, vol.\u00a0139, pp. 8748\u20138763 (2021)"},{"key":"17_CR2","unstructured":"Chenfei, W., Shengming, Y., Weizhen, Q., Xiaodong, W., Zecheng, T., Nan, D.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv:2303.04671 (2023)"},{"key":"17_CR3","unstructured":"Chiang, W.L., Li, Z., et\u00a0al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Hang, Z., Xin, L., Lidong, B.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"17_CR5","unstructured":"Haotian, L., Chunyuan, L., Qingyang, W., Yong, Jae, L.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"17_CR7","unstructured":"Hugo, T., Louis, M., et\u00a0al.: LLaMA 2: open foundation and fine-tuned chat models. arXiv:2307.09288 (2023)"},{"key":"17_CR8","unstructured":"Hugo, T., et al.: LLaMA: open and efficient foundation language models. 
arXiv:2302.13971 (2023)"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Jin, P., Takanobu, R., Zhang, C., Cao, X., Yuan, L.: Chat-UniVi: unified visual representation empowers large language models with image and video understanding. arXiv:2311.08046 (2023)","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"17_CR10","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models. arXiv:2001.08361 (2020)"},{"key":"17_CR11","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv:1705.06950 (2017)"},{"key":"17_CR12","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML, pp. 19730\u201319742 (2023)"},{"key":"17_CR13","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv:2305.06355 (2023)"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: MVBench: a comprehensive multi-modal video understanding benchmark. arXiv:2311.17005 (2023)","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: UniFormerV2: unlocking the potential of image ViTs for video understanding. In: ICCV, pp. 1632\u20131643 (2023)","DOI":"10.1109\/ICCV51070.2023.00157"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Unmasked Teacher: towards training-efficient video foundation models. In: ICCV, pp. 19948\u201319960 (2023)","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., Jia, J.: LLaMA-VID: an image is worth 2 tokens in large language models. arXiv:2311.17043 (2023)","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Lin, B., et al.: Video-LLaVA: learning united visual representation by alignment before projection. arXiv:2311.10122 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"17_CR19","unstructured":"Muhammad, M., Hanoona, R., Salman, K., Fahad, Shahbaz, K.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv:2306.05424 (2023)"},{"key":"17_CR20","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, vol.\u00a030 (2017)"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: VideoMAE V2: scaling video masked autoencoders with dual masking. In: CVPR, pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"17_CR22","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. In: ICLR (2022)"},{"key":"17_CR23","unstructured":"Xiuyuan, C., Yuan, L., Yuchen, Z., Weiran, H.: AutoEval-video: an automatic benchmark for assessing large vision language models in open-ended video question answering. arXiv:2311.14906 (2023)"},{"key":"17_CR24","unstructured":"Zhao, W.X., Zhou, K., et\u00a0al.: A survey of large language models. arXiv:2303.18223 (2023)"},{"key":"17_CR25","unstructured":"Zhu, B., et al.: LanguageBind: extending video-language pretraining to n-modality by language-based semantic alignment. arXiv:2310.01852 (2023)"},{"key":"17_CR26","unstructured":"Zoph, B., et al.: Emergent abilities of large language models. 
In: Proceedings of Transactions on Machine Learning Research (2022)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78347-0_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T15:03:24Z","timestamp":1733065404000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78347-0_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031783463","9783031783470"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78347-0_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
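The record above is a standard Crossref "work" payload and can be re-fetched live from the public Crossref REST API (GET https://api.crossref.org/works/{DOI}). The snippet below is a minimal sketch of how one might read back the fields shown in this record; it assumes the third-party Python "requests" package, and the DOI and all field names come straight from the "message" object above.

# Minimal sketch (not part of the record): re-fetch this Crossref work and
# read back a few of the fields shown above. Assumes the public Crossref
# REST API at api.crossref.org and the third-party "requests" package.
import requests

DOI = "10.1007/978-3-031-78347-0_17"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # same shape as the "message" object above

print(work["title"][0])                       # chapter title
for a in work["author"]:                      # author list with ORCID iDs
    print(a["given"], a["family"], a.get("ORCID", ""))
print(work["reference-count"], "references")  # matches "reference-count": 26
print(work["resource"]["primary"]["URL"])     # landing page on SpringerLink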