{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:52Z","timestamp":1778171092591,"version":"3.51.4"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730382","type":"print"},{"value":"9783031730399","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73039-9_12","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"202-218","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":39,"title":["LITA: Language Instructed Temporal-Localization Assistant"],"prefix":"10.1007","author":[{"given":"De-An","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shijia","family":"Liao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Subhashree","family":"Radhakrishnan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongxu","family":"Yin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pavlo","family":"Molchanov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiding","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jan","family":"Kautz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"12_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"12_CR2","unstructured":"Awadalla, A., et al.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. arXiv:2308.01390 (2023)"},{"key":"12_CR3","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"12_CR5","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478 (2023)"},{"key":"12_CR6","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"12_CR7","unstructured":"Chowdhery, A., et\u00a0al.: PaLM: scaling language modeling with pathways. arXiv:2204.02311 (2022)"},{"key":"12_CR8","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"issue":"11","key":"12_CR9","doi-asserted-by":"publisher","first-page":"4125","DOI":"10.1109\/TPAMI.2020.2991965","volume":"43","author":"D Damen","year":"2021","unstructured":"Damen, D., et al.: The EPIC-KITCHENS dataset: collection, challenges and baselines. IEEE Trans. PAMI 43(11), 4125\u20134141 (2021)","journal-title":"IEEE Trans. PAMI"},{"key":"12_CR10","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s11263-021-01531-2","volume":"130","author":"D Damen","year":"2022","unstructured":"Damen, D., et al.: Rescaling egocentric vision: collection, pipeline and challenges for epic-kitchens-100. IJCV 130, 33\u201355 (2022)","journal-title":"IJCV"},{"key":"12_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Huang, B., Wang, X., Chen, H., Song, Z., Zhu, W.: VTimeLLM: empower LLM to grasp video moments. arXiv preprint arXiv:2311.18445 (2023)","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Huang, G., Pang, B., Zhu, Z., Rivera, C., Soricut, R.: Multimodal pretraining for dense video captioning. In: AACL-IJCNLP (2020)","DOI":"10.18653\/v1\/2020.aacl-main.48"},{"key":"12_CR15","unstructured":"Jiang, Y.G., et al.: THUMOS challenge: action recognition with a large number of classes (2014). https:\/\/www.crcv.ucf.edu\/THUMOS14\/home.html"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, J.C.: Dense-captioning events in videos. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Arslan, A.B., Serre, T.: The language of actions: Recovering the syntax and semantics of goal-directed human activities. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.105"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: reasoning segmentation via large language model. arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"12_CR19","unstructured":"Li, B., et al.: MIMIC-IT: multi-modal in-context instruction tuning. arXiv:2306.05425 (2023)"},{"key":"12_CR20","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv:2305.06355 (2023)"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., Jia, J.: LLaMA-VID: an image is worth 2 tokens in large language models. arXiv preprint arXiv:2311.17043 (2023)","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"12_CR22","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., Yuan, L.: Video-LLaVA: learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"12_CR24","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"12_CR25","unstructured":"Luo, R., et al.: Valley: video assistant with large language model enhanced ability (2023)"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"12_CR27","unstructured":"Maaz, M., Hanoona\u00a0Rasheed, S.K., Khan, F.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv: 2306.05424 (2023)"},{"key":"12_CR28","unstructured":"OpenAI: GPT-4 technical report. arXiv:2303.08774 (2023)"},{"key":"12_CR29","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv:2306.14824 (2023)"},{"key":"12_CR30","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"12_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in Homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: COIN: a large-scale dataset for comprehensive instructional video analysis. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00130"},{"key":"12_CR33","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. arXiv:2302.13971 (2023)"},{"key":"12_CR34","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: open foundation and fine-tuned chat models. arXiv:2307.09288 (2023)"},{"key":"12_CR35","unstructured":"Wu, S., Fei, H., Qu, L., Ji, W., Chua, T.S.: NExT-GPT: any-to-any multimodal LLM. arXiv:2309.05519 (2023)"},{"key":"12_CR36","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: NExT-QA: next phase of question-answering to explaining temporal actions. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Yan, S., et al.: UnLoc: a unified framework for video localization tasks. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01253"},{"key":"12_CR38","doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2Seq: large-scale pretraining of a visual language model for dense video captioning. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"12_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"12_CR40","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of language models with zero-init attention. arXiv:2303.16199 (2023)"},{"key":"12_CR41","unstructured":"Zhang, Y., et al.: Siren\u2019s song in the AI ocean: a survey on hallucination in large language models. arXiv:2309.01219 (2023)"},{"key":"12_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.J.: Towards automatic learning of procedures from web instructional videos. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"12_CR43","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73039-9_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T14:15:39Z","timestamp":1732976139000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73039-9_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730382","9783031730399"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73039-9_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}