{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T18:16:36Z","timestamp":1771265796780,"version":"3.50.1"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726699","type":"print"},{"value":"9783031726705","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_5","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"75-92","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["VideoAgent: A Memory-Augmented Multimodal Agent for\u00a0Video Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5635-4320","authenticated-orcid":false,"given":"Yue","family":"Fan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5609-3822","authenticated-orcid":false,"given":"Xiaojian","family":"Ma","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6426-1248","authenticated-orcid":false,"given":"Rujie","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4709-4003","authenticated-orcid":false,"given":"Yuntao","family":"Du","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1318-5123","authenticated-orcid":false,"given":"Jiaqi","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4424-4352","authenticated-orcid":false,"given":"Zhi","family":"Gao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1185-5365","authenticated-orcid":false,"given":"Qing","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"5_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Gao, Z., et al.: CLOVA: a closed-loop visual assistant with tool usage and update. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52733.2024.01259"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Gong, R., et\u00a0al.: MindAgent: emergent gaming interaction. arXiv preprint arXiv:2309.09971 (2023)","DOI":"10.18653\/v1\/2024.findings-naacl.200"},{"key":"5_CR4","unstructured":"Grauman, K., et\u00a0al.: Ego4D: around the world in 3,000 hours of egocentric video. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2022)"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Gupta, T., Kembhavi, A.: Visual programming: compositional visual reasoning without training. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Temporal alignment networks for long-term video. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00292"},{"key":"5_CR7","unstructured":"Hou, Z., et al.: GroundNLQ@ Ego4D natural language queries challenge 2023. arXiv preprint arXiv:2306.15255 (2023)"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Jia, B., Chen, Y., Huang, S., Zhu, Y., Zhu, S.C.: Lemma: a multi-view dataset for learning multi-agent multi-task activities. In: European Conference on Computer Vision (ECCV) (2020)","DOI":"10.1007\/978-3-030-58574-7_46"},{"key":"5_CR9","unstructured":"Jia, B., Lei, T., Zhu, S.C., Huang, S.: EgoTaskQA: understanding human tasks in egocentric videos. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Korbar, B., Xian, Y., Tonioni, A., Zisserman, A., Tombari, F.: Text-conditioned resampler for long form video understanding. arXiv preprint arXiv:2312.11897 (2023)","DOI":"10.1007\/978-3-031-73016-0_16"},{"key":"5_CR11","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning (ICML) (2023)"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., Yuan, L.: Video-LLaVA: learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"5_CR13","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (NeurIPS) (2024)"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"5_CR15","unstructured":"Mangalam, K., Akshulakov, R., Malik, J.: EgoSchema: a diagnostic benchmark for very long-form video language understanding. In: Advances in Neural Information Processing Systems (NeurIPS) (2024)"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: learning a text-video embedding by watching hundred million narrated video clips. In: International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"5_CR17","unstructured":"OpenAI: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"5_CR18","unstructured":"Oquab, M., et al..: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"5_CR19","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML) (2021)"},{"key":"5_CR20","unstructured":"Schick, T., et al.: Toolformer: Language models can teach themselves to use tools. In: Advances in Neural Information Processing Systems (NeurIPS) (2024)"},{"key":"5_CR21","unstructured":"Shafiullah, N.M.M., Paxton, C., Pinto, L., Chintala, S., Szlam, A.: Clip-fields: weakly supervised semantic fields for robotic memory. arXiv preprint arXiv:2210.05663 (2022)"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Song, E., et\u00a0al.: MovieChat: from dense token to sparse memory for long video understanding. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Menon, S., Vondrick, C.: ViperGPT: visual inference via python execution for reasoning. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: MovieQA: understanding stories in movies through question-answering. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"5_CR25","unstructured":"Tay, Y., et al.: Long range arena: a benchmark for efficient transformers. arXiv preprint arXiv:2011.04006 (2020)"},{"key":"5_CR26","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"5_CR28","unstructured":"Wang, Y., et\u00a0al.: InternVid: a large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942 (2023)"},{"key":"5_CR29","unstructured":"Wang, Y., et al.: InternVideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"5_CR30","unstructured":"Wang, Y., Yang, Y., Ren, M.: LifelongMemory: Leveraging LLMs for answering queries in egocentric videos. arXiv preprint arXiv:2312.05269 (2023)"},{"key":"5_CR31","unstructured":"Wang, Y., Wang, Y., Wu, P., Liang, J., Zhao, D., Zheng, Z.: LSTP: language-guided spatial-temporal prompt learning for long-form video-text understanding. arXiv preprint arXiv:2402.16050 (2024)"},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Wiles, O., Carreira, J., Barr, I., Zisserman, A., Malinowski, M.: Compressed vision for efficient video understanding. In: Asian Conference on Computer Vision (ACCV) (2022)","DOI":"10.1007\/978-3-031-26293-7_40"},{"key":"5_CR33","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., Krahenbuhl, P.: Towards long-form video understanding. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00192"},{"key":"5_CR34","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: NExT-QA: next phase of question-answering to explaining temporal actions. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"5_CR36","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"5_CR37","unstructured":"Yang, Z., Chen, G., Li, X., Wang, W., Yang, Y.: DoraemonGPT: toward understanding dynamic scenes with large language models (exemplified as a video agent). In: International Conference on Machine Learning (ICML) (2024)"},{"key":"5_CR38","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"5_CR39","unstructured":"Yu, S., Cho, J., Yadav, P., Bansal, M.: Self-chained image-language model for video localization and question answering. In: Advances in Neural Information Processing Systems (NeurIPS) (2024)"},{"key":"5_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: A simple LLM framework for long-range video question-answering. arXiv preprint arXiv:2312.17235 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMa: an instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"5_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"5_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2D temporal adjacent networks for moment localization with natural language. In: AAAI Conference on Artificial Intelligence (AAAI) (2020)","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"5_CR44","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Sun, P., Jiang, Y., Yu, D., Weng, F., Yuan, Z., Luo, P., Liu, W., Wang, X.: Bytetrack: Multi-object tracking by associating every detection box. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision (ECCV). Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_1","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"5_CR45","unstructured":"Zhang, Y., Zhang, K., Li, B., Pu, F., Setiadharma, C.A., Yang, J., Liu, Z.: WorldQA: multimodal world knowledge in videos through long-chain reasoning. arXiv preprint arXiv:2405.03272 (2024)"},{"key":"5_CR46","unstructured":"Zhao, H., et al.: MMICL: empowering vision-language model with multi-modal in-context learning. arXiv preprint arXiv:2309.07915 (2023)"},{"key":"5_CR47","doi-asserted-by":"crossref","unstructured":"Zhao, Y., et al.: DETRs beat YOLOs on real-time object detection. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"5_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., Girdhar, R.: Learning video representations from large language models. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Zhu, C., et al.: EgoObjects: a large-scale egocentric dataset for fine-grained object understanding. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01840"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:20:42Z","timestamp":1732828842000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}