{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:49:54Z","timestamp":1777657794897,"version":"3.51.4"},"publisher-location":"Cham","reference-count":66,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729515","type":"print"},{"value":"9783031729522","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72952-2_19","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T05:02:02Z","timestamp":1727672522000},"page":"323-340","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":161,"title":["LLaMA-VID: An Image is Worth 2 Tokens in\u00a0Large Language Models"],"prefix":"10.1007","author":[{"given":"Yanwei","family":"Li","sequence":"first","affiliation":[]},{"given":"Chengyao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jiaya","family":"Jia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"19_CR1","unstructured":"Sharegpt (2023). https:\/\/sharegpt.com\/"},{"key":"19_CR2","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"19_CR3","unstructured":"Anthropic: Claude 2 (2023). https:\/\/www.anthropic.com\/index\/claude-2"},{"key":"19_CR4","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv:2308.12966 (2023)"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"19_CR6","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"19_CR8","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL (2011)"},{"key":"19_CR9","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv:2306.15195 (2023)"},{"key":"19_CR10","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv:1504.00325 (2015)"},{"key":"19_CR11","unstructured":"Chen, Y., et al.: LongLoRA: efficient fine-tuning of long-context large language models. arXiv:2309.12307 (2023)"},{"key":"19_CR12","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). 
https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"19_CR13","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv:2305.06500 (2023)"},{"key":"19_CR14","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805 (2018)"},{"key":"19_CR15","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: EVA: exploring the limits of masked visual representation learning at scale. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"19_CR17","unstructured":"Fu, C., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv:2306.13394 (2023)"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Gurari, D., et al.: Vizwiz grand challenge: answering visual questions from blind people. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"key":"19_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1007\/978-3-030-58548-8_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Q Huang","year":"2020","unstructured":"Huang, Q., Xiong, Y., Rao, A., Wang, J., Lin, D.: MovieNet: a holistic dataset for movie understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 709\u2013727. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_41"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"19_CR23","unstructured":"IDEFICS: Introducing IDEFICS: an open reproduction of state-of-the-art visual language model (2023). https:\/\/huggingface.co\/blog\/idefics"},{"key":"19_CR24","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"19_CR26","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. IJCV 123, 32\u201373 (2017)","journal-title":"IJCV"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: reasoning segmentation via large language model. 
arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: SEED-Bench: benchmarking multimodal LLMs with generative comprehension. arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"19_CR29","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv:2301.12597 (2023)"},{"key":"19_CR30","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv:2305.06355 (2023)"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"19_CR33","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeruIPS (2023)"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Liu, R., Li, C., Ge, Y., Shan, Y., Li, T.H., Li, G.: One for all: video conversation is feasible without video instruction tuning. arXiv:2309.15785 (2023)","DOI":"10.1109\/CVPR52733.2024.01296"},{"key":"19_CR35","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv:1907.11692 (2019)"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: MMBench: is your multi-modal model an all-around player? arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"19_CR37","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering. In: NeurIPS (2022)"},{"key":"19_CR38","unstructured":"Luo, R., et al.: Valley: video assistant with large language model enhanced ability. arXiv:2306.07207 (2023)"},{"key":"19_CR39","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"19_CR40","unstructured":"Mangalam, K., Akshulakov, R., Malik, J.: EgoSchema: a diagnostic benchmark for very long-form video language understanding. In: NeurIPS (2024)"},{"key":"19_CR41","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: OCR-VQA: visual question answering by reading text in images. In: ICDAR (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"19_CR43","unstructured":"OpenAI: ChatGPT (2023). https:\/\/openai.com\/blog\/chatgpt\/"},{"key":"19_CR44","unstructured":"OpenAI: Gpt-4 technical report (2023). arXiv:2303.08774"},{"key":"19_CR45","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: NeurIPS (2022)"},{"key":"19_CR46","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. 
In: ICML (2021)"},{"key":"19_CR47","doi-asserted-by":"publisher","first-page":"146","DOI":"10.1007\/978-3-031-20074-8_9","volume-title":"ECCV","author":"D Schwenk","year":"2022","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-OKVQA: a benchmark for visual question answering using world knowledge. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13668, pp. 146\u2013162. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_9"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"19_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"742","DOI":"10.1007\/978-3-030-58536-5_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"O Sidorov","year":"2020","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 742\u2013758. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_44"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"19_CR51","doi-asserted-by":"crossref","unstructured":"Song, E., et al.: MovieChat: from dense token to sparse memory for long video understanding. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"19_CR52","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"19_CR53","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. arXiv:2302.13971 (2023)"},{"key":"19_CR54","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"19_CR55","unstructured":"Wang, Y., et al.: InternVideo: general video foundation models via generative and discriminative learning. arXiv:2212.03191 (2022)"},{"key":"19_CR56","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. arXiv:2109.01652 (2021)"},{"key":"19_CR57","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv:2303.04671 (2023)"},{"key":"19_CR58","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"19_CR59","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. In: NeurIPS (2022)"},{"key":"19_CR60","unstructured":"Yang, R., et al.: GPT4Tools: teaching large language model to use tools via self-instruction. arXiv:2305.18752 (2023)"},{"key":"19_CR61","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality. arXiv:2304.14178 (2023)"},{"key":"19_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: A simple LLM framework for long-range video question-answering. 
arXiv:2312.17235 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"19_CR63","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"19_CR64","unstructured":"Zhang, R., et al.: LLaMA-Adapter: efficient fine-tuning of language models with zero-init attention. arXiv:2303.16199 (2023)"},{"key":"19_CR65","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv:2205.01068 (2022)"},{"key":"19_CR66","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72952-2_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:41:15Z","timestamp":1732830075000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72952-2_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729515","9783031729522"],"references-count":66,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72952-2_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}