{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:36:29Z","timestamp":1777570589693,"version":"3.51.4"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726514","type":"print"},{"value":"9783031726521","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72652-1_10","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"160-176","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["PiTe: Pixel-Temporal Alignment for\u00a0Large Video-Language Model"],"prefix":"10.1007","author":[{"given":"Yang","family":"Liu","sequence":"first","affiliation":[]},{"given":"Pengxiang","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Siteng","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Min","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Han","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Donglin","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"10_CR1","unstructured":"Alayrac, J., et al.: Flamingo: a visual language model for few-shot learning. In: Proceedings of NeurIPS (2022)"},{"key":"10_CR2","unstructured":"Arthur, D., Vassilvitskii, S.: K-means++: the advantages of careful seeding. In: Proceedings of SODA, pp. 1027\u20131035 (2007)"},{"key":"10_CR3","unstructured":"Awadalla, A., et al.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. CoRR (2023)"},{"key":"10_CR4","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Proceedings of NeurIPS (2020)"},{"key":"10_CR5","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023)"},{"key":"10_CR6","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Proceedings of NeurIPS (2023)"},{"key":"10_CR7","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of AACL, pp. 4171\u20134186 (2019)"},{"key":"10_CR8","unstructured":"Dosovitskiy, A., et al.: An image is worth 16\u00a0$$\\times $$\u00a016 words: transformers for image recognition at scale. In: Proceedings of ICLR (2021)"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Du, Z., et al.: GLM: general language model pretraining with autoregressive blank infilling. In: Proceedings of ACL, pp. 320\u2013335 (2022)","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"10_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1007\/978-3-030-58539-6_31","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Fujita","year":"2020","unstructured":"Fujita, S., Hirao, T., Kamigaito, H., Okumura, M., Nagata, M.: SODA: story oriented dense video captioning evaluation framework. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 517\u2013531. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_31"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of CVPR, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"10_CR12","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: Proceedings of ICLR (2022)"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Huang, B., Wang, X., Chen, H., Song, Z., Zhu, W.: VTimeLLM: empower LLM to grasp video moments. In: Proceedings of CVPR, pp. 14271\u201314280 (2024)","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"10_CR14","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models. CoRR (2020)"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, J.C.: Dense-captioning events in videos. In: Proceedings of ICCV, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Lavie, A., Agarwal, A.: METEOR: an automatic metric for MT evaluation with high levels of correlation with human judgments. In: Proceedings of WMT@ACL, pp. 228\u2013231 (2007)","DOI":"10.3115\/1626355.1626389"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Le\u00a0Moing, G., Ponce, J., Schmid, C.: Dense optical tracking: connecting the dots. In: Proceedings of CVPR, pp. 19187\u201319197 (2024)","DOI":"10.1109\/CVPR52733.2024.01815"},{"key":"10_CR18","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of ICML, pp. 19730\u201319742 (2023)"},{"key":"10_CR19","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. CoRR (2023)"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Lin, J., Yin, H., Ping, W., Molchanov, P., Shoeybi, M., Han, S.: VILA: on pre-training for visual language models. In: Proceedings of CVPR, pp. 26689\u201326699 (2024)","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: Proceedings of CVPR, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"10_CR22","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Proceedings of NeurIPS (2023)"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., Ping, H., Zhang, D., Sun, Q., Li, S., Zhou, G.: Comment-aware multi-modal heterogeneous pre-training for humor detection in short-form videos. In: Proceedings of ECAI, pp. 1568\u20131575 (2023)","DOI":"10.3233\/FAIA230438"},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Y., Shen, T., Zhang, D., Sun, Q., Li, S., Zhou, G.: Comment-aided video-language alignment via contrastive pre-training for short-form video humor detection. In: Proceedings of ICMR, pp. 442\u2013450 (2024)","DOI":"10.1145\/3652583.3658094"},{"key":"10_CR25","unstructured":"Luo, R., et al.: Valley: video assistant with large language model enhanced ability. CoRR (2023)"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H.A., Khan, S.H., Khan, F.S.: Video-ChatGPT: towards detailed video understanding via large vision and language models (2024)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"10_CR27","unstructured":"Munasinghe, S., et al.: PG-Video-LLaVA: pixel grounding large video-language models. CoRR (2023)"},{"key":"10_CR28","unstructured":"OpenAI: ChatGPT: optimizing language models for dialogue (2023)"},{"key":"10_CR29","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Proceedings of NeurIPS (2022)"},{"key":"10_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1007\/978-3-030-58558-7_38","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Pont-Tuset","year":"2020","unstructured":"Pont-Tuset, J., Uijlings, J., Changpinyo, S., Soricut, R., Ferrari, V.: Connecting vision and language with localized narratives. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 647\u2013664. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_38"},{"key":"10_CR31","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of ICML, pp. 8748\u20138763 (2021)"},{"key":"10_CR32","unstructured":"Radford, A., Narasimhan, K.: Improving language understanding by generative pre-training (2018)"},{"key":"10_CR33","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019)"},{"key":"10_CR34","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 140:1\u2013140:67 (2020)"},{"key":"10_CR35","doi-asserted-by":"crossref","unstructured":"Rasheed, H., et al.: GLaMM: pixel grounding large multimodal model. In: Proceedings of CVPR, pp. 13009\u201313018 (2024)","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"10_CR36","unstructured":"Scao, T.L., et\u00a0al.: BLOOM: a 176B-parameter open-access multilingual language model. CoRR (2022)"},{"key":"10_CR37","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. CoRR (2023)"},{"key":"10_CR38","unstructured":"Touvron, H., et al.: LLaMA 2: open foundation and fine-tuned chat models. CoRR (2023)"},{"key":"10_CR39","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10_CR40","unstructured":"Wang, Y., et al.: InternVid: a large-scale video-text dataset for multimodal understanding and generation. In: Proceedings of ICLR (2024)"},{"key":"10_CR41","unstructured":"Xu, D., et al.: Video question answering via gradually refined attention over appearance and motion. In: Proceedings of the 2017 ACM on Multimedia Conference, MM 2017, Mountain View, CA, USA, 23\u201327 October 2017, pp. 1645\u20131653 (2017)"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Pixel-aligned language model. In: Proceedings of CVPR, pp. 13030\u201313039 (2024)","DOI":"10.1109\/CVPR52733.2024.01238"},{"key":"10_CR43","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"10_CR44","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. In: Proceedings of NeurIPS (2022)"},{"key":"10_CR45","doi-asserted-by":"crossref","unstructured":"Yu, Z., et al.: ActivityNet-QA: a dataset for understanding complex web videos via question answering. In: Proceedings of AAAI, pp. 9127\u20139134 (2019)","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"10_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. In: Proceedings of EMNLP, pp. 543\u2013553 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"10_CR47","doi-asserted-by":"publisher","unstructured":"Zhang, M., Huang, S., Li, W., Wang, D.: Tree structure-aware few-shot image classification via hierarchical aggregation. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Proceedings of ECCV, pp. 453\u2013470. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20044-1_26","DOI":"10.1007\/978-3-031-20044-1_26"},{"key":"10_CR48","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of large language models with zero-initialized attention. In: Proceedings of ICLR (2024)"},{"key":"10_CR49","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. CoRR (2022)"},{"key":"10_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Li, Z., Zhang, M.: Efficient second-order TreeCRF for neural dependency parsing. In: Proceedings of ACL, pp. 3295\u20133305 (2020)","DOI":"10.18653\/v1\/2020.acl-main.302"},{"key":"10_CR51","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zhou, H., Li, Z.: Fast and accurate neural CRF constituency parsing. In: Proceedings of IJCAI, pp. 4046\u20134053 (2020)","DOI":"10.24963\/ijcai.2020\/560"},{"key":"10_CR52","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: Proceedings of ICLR (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72652-1_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:33:20Z","timestamp":1730190800000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72652-1_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031726514","9783031726521"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72652-1_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}