{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:02:39Z","timestamp":1775325759925,"version":"3.50.1"},"publisher-location":"Cham","reference-count":65,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730061","type":"print"},{"value":"9783031730078","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73007-8_9","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T19:02:40Z","timestamp":1727722960000},"page":"140-158","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["PALM: Predicting Actions through Language Models"],"prefix":"10.1007","author":[{"given":"Sanghwan","family":"Kim","sequence":"first","affiliation":[]},{"given":"Daoji","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Yongqin","family":"Xian","sequence":"additional","affiliation":[]},{"given":"Otmar","family":"Hilliges","sequence":"additional","affiliation":[]},{"given":"Luc","family":"Van Gool","sequence":"additional","affiliation":[]},{"given":"Xi","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"9_CR1","unstructured":"Ahn, M., et\u00a0al.: Do as i can, not as i say: grounding language in robotic affordances. arXiv preprint arXiv:2204.01691 (2022)"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Ashutosh, K., Girdhar, R., Torresani, L., Grauman, K.: HierVL: learning hierarchical video-language embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23066\u201323078 (2023)","DOI":"10.1109\/CVPR52729.2023.02209"},{"key":"9_CR3","unstructured":"Black, S., Gao, L., Wang, P., Leahy, C., Biderman, S.: GPT-NEO: large scale autoregressive language modeling with mesh-tensorflow. If you use this software, please cite it using these metadata, vol. 58 (2021)"},{"key":"9_CR4","doi-asserted-by":"publisher","unstructured":"Black, S., Gao, L., Wang, P., Leahy, C., Biderman, S.: GPT-NEO: large scale autoregressive language modeling with mesh-tensorflow (2021). https:\/\/doi.org\/10.5281\/zenodo.5297715, If you use this software, please cite it using these metadata","DOI":"10.5281\/zenodo.5297715"},{"key":"9_CR5","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"key":"9_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. 
Syst."},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Carbonell, J., Goldstein, J.: The use of MMR, diversity-based reranking for reordering documents and producing summaries. In: Proceedings of the 21st Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 335\u2013336 (1998)","DOI":"10.1145\/290941.291025"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"9_CR9","unstructured":"Chen, G., et\u00a0al.: Videollm: modeling video sequence with large language models. arXiv preprint arXiv:2305.13292 (2023)"},{"key":"9_CR10","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning, pp. 1931\u20131942. PMLR (2021)"},{"key":"9_CR11","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning (2023)"},{"issue":"11","key":"9_CR12","doi-asserted-by":"publisher","first-page":"4125","DOI":"10.1109\/TPAMI.2020.2991965","volume":"43","author":"D Damen","year":"2020","unstructured":"Damen, D., et al.: The epic-kitchens dataset: collection, challenges and baselines. IEEE Trans. Pattern Anal. Mach. Intell. 43(11), 4125\u20134141 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"3","key":"9_CR13","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1145\/363958.363994","volume":"7","author":"FJ Damerau","year":"1964","unstructured":"Damerau, F.J.: A technique for computer detection and correction of spelling errors. Commun. ACM 7(3), 171\u2013176 (1964)","journal-title":"Commun. ACM"},{"key":"9_CR14","unstructured":"Das, S., Ryoo, M.S.: Video+ clip baseline for ego4d long-term action anticipation. arXiv preprint arXiv:2207.00579 (2022)"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Multiscale vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6824\u20136835 (2021)","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Ramanan, D., Gupta, A., Sivic, J., Russell, B.: Actionvlad: learning spatio-temporal aggregation for action classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 971\u2013980 (2017)","DOI":"10.1109\/CVPR.2017.337"},{"key":"9_CR18","unstructured":"Grauman, K., et\u00a0al.: Ego4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"9_CR19","unstructured":"Hu, E.J., et\u00a0al.: Lora: low-rank adaptation of large language models. In: International Conference on Learning Representations (2021)"},{"key":"9_CR20","unstructured":"Huang, D., Hilliges, O., Van\u00a0Gool, L., Wang, X.: Palm: predicting actions through language models@ ego4d long-term action anticipation challenge 2023. 
arXiv preprint arXiv:2306.16545 (2023)"},{"key":"9_CR21","unstructured":"Huang, W., Abbeel, P., Pathak, D., Mordatch, I.: Language models as zero-shot planners: extracting actionable knowledge for embodied agents. In: International Conference on Machine Learning, pp. 9118\u20139147. PMLR (2022)"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Hussein, N., Gavves, E., Smeulders, A.W.: Timeception for complex action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 254\u2013263 (2019)","DOI":"10.1109\/CVPR.2019.00034"},{"key":"9_CR23","unstructured":"Hussein, N., Gavves, E., Smeulders, A.W.: Videograph: recognizing minutes-long human activities in videos. arXiv preprint arXiv:1905.05143 (2019)"},{"key":"9_CR24","unstructured":"Ichter, B., et\u00a0al.: Do as i can, not as i say: grounding language in robotic affordances. In: 6th Annual Conference on Robot Learning (2022)"},{"key":"9_CR25","unstructured":"Keunwoo\u00a0Peter, Y.: Videoblip is a large vision-language model based on blip-2 that can generate texts conditioned on videos (2021). https:\/\/github.com\/yukw777\/VideoBLIP, If you use this software, please cite it using these metadata"},{"issue":"13","key":"9_CR26","doi-asserted-by":"publisher","first-page":"3521","DOI":"10.1073\/pnas.1611835114","volume":"114","author":"J Kirkpatrick","year":"2017","unstructured":"Kirkpatrick, J., et al.: Overcoming catastrophic forgetting in neural networks. Proc. Nat. Acad. Sci. 114(13), 3521\u20133526 (2017)","journal-title":"Proc. Nat. Acad. Sci."},{"key":"9_CR27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.cviu.2016.09.001","volume":"154","author":"M Leo","year":"2017","unstructured":"Leo, M., Medioni, G., Trivedi, M., Kanade, T., Farinella, G.M.: Computer vision for assistive technologies. Comput. Vis. Image Underst. 154, 1\u201315 (2017)","journal-title":"Comput. Vis. Image Underst."},{"key":"9_CR28","unstructured":"Levenshtein, V.I., et\u00a0al.: Binary codes capable of correcting deletions, insertions, and reversals. In: Soviet Physics Doklady, vol.\u00a010, pp. 707\u2013710. Soviet Union (1966)"},{"key":"9_CR29","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"9_CR30","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"9_CR31","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: Visualbert: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"9_CR32","first-page":"31199","volume":"35","author":"S Li","year":"2022","unstructured":"Li, S., et al.: Pre-trained language models for interactive decision-making. Adv. Neural. Inf. Process. Syst. 35, 31199\u201331212 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Li, Y., Liu, M., Rehg, J.M.: In the eye of beholder: joint learning of gaze and actions in first person video. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 619\u2013635 (2018)","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"9_CR34","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. Adv. Neural Inf. 
Process. Syst. 35, 7575\u20137586 (2022)"},{"key":"9_CR35","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Mascar\u00f3, E.V., Ahn, H., Lee, D.: Intention-conditioned long-term human egocentric action anticipation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 6048\u20136057 (2023)","DOI":"10.1109\/WACV56688.2023.00599"},{"key":"9_CR37","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1016\/j.jvcir.2018.01.009","volume":"51","author":"G Meditskos","year":"2018","unstructured":"Meditskos, G., Plans, P.M., Stavropoulos, T.G., Benois-Pineau, J., Buso, V., Kompatsiaris, I.: Multi-modal activity recognition from egocentric vision, semantic enrichment and lifelogging applications for the care of dementia. J. Vis. Commun. Image Represent. 51, 169\u2013190 (2018)","journal-title":"J. Vis. Commun. Image Represent."},{"key":"9_CR38","doi-asserted-by":"crossref","unstructured":"Nagarajan, T., Li, Y., Feichtenhofer, C., Grauman, K.: Ego-topo: environment affordances from egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 163\u2013172 (2020)","DOI":"10.1109\/CVPR42600.2020.00024"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Nakazawa, A., Honda, M.: First-person camera system to evaluate tender dementia-care skill. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00544"},{"key":"9_CR40","unstructured":"Naveed, H., et al.: A comprehensive overview of large language models. arXiv preprint arXiv:2307.06435 (2023)"},{"key":"9_CR41","doi-asserted-by":"publisher","unstructured":"Nawhal, M., Jyothi, A.A., Mori, G.: Rethinking learning approaches for long-term action anticipation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13694, pp. 558\u2013576. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19830-4_32","DOI":"10.1007\/978-3-031-19830-4_32"},{"key":"9_CR42","doi-asserted-by":"crossref","unstructured":"Neumann, L., Zisserman, A., Vedaldi, A.: Future event prediction: if and when. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00354"},{"key":"9_CR43","unstructured":"OhnBar, E., Kitani, K., Asakawa, C.: Personalized dynamics models for adaptive assistive navigation systems. In: Conference on Robot Learning, pp. 16\u201339. PMLR (2018)"},{"key":"9_CR44","doi-asserted-by":"crossref","unstructured":"Pasca, R.G., et al.: Summarize the past to predict the future: natural language descriptions of context boost multimodal object interaction anticipation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18286\u201318296 (2024)","DOI":"10.1109\/CVPR52733.2024.01731"},{"key":"9_CR45","doi-asserted-by":"crossref","unstructured":"Patel, D., Eghbalzadeh, H., Kamra, N., Iuzzolino, M.L., Jain, U., Desai, R.: Pretrained language models as visual planners for human assistance. 
arXiv preprint arXiv:2304.09179 (2023)","DOI":"10.1109\/ICCV51070.2023.01404"},{"key":"9_CR46","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: EgoVLPv2: egocentric video-language pre-training with fusion in the backbone. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5285\u20135297 (2023)","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"9_CR47","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR48","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et\u00a0al.: Language models are unsupervised multitask learners (2019)"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Ragusa, F., Farinella, G.M., Furnari, A.: Stillfast: an end-to-end approach for short-term object interaction anticipation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3635\u20133644 (2023)","DOI":"10.1109\/CVPRW59228.2023.00371"},{"key":"9_CR50","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103252","volume":"211","author":"I Rodin","year":"2021","unstructured":"Rodin, I., Furnari, A., Mavroeidis, D., Farinella, G.M.: Predicting the future from first person (egocentric) vision: a survey. Comput. Vis. Image Underst. 211, 103252 (2021)","journal-title":"Comput. Vis. Image Underst."},{"key":"9_CR51","doi-asserted-by":"crossref","unstructured":"Ryoo, M.S., Fuchs, T.J., Xia, L., Aggarwal, J.K., Matthies, L.: Robot-centric activity prediction from first-person videos: what will they do to me? In: Proceedings of the Tenth Annual ACM\/IEEE International Conference on Human-Robot Interaction, pp. 295\u2013302 (2015)","DOI":"10.1145\/2696454.2696462"},{"key":"9_CR52","first-page":"16857","volume":"33","author":"K Song","year":"2020","unstructured":"Song, K., Tan, X., Qin, T., Lu, J., Liu, T.Y.: MPNet: masked and permuted pre-training for language understanding. Adv. Neural. Inf. Process. Syst. 33, 16857\u201316867 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR53","unstructured":"Touvron, H., et\u00a0al.: LLAMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"9_CR54","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"9_CR55","doi-asserted-by":"crossref","unstructured":"Wang, S., Zhao, Q., Do, M.Q., Agarwal, N., Lee, K., Sun, C.: Vamos: versatile action models for video understanding. arXiv preprint arXiv:2311.13627 (2023)","DOI":"10.1007\/978-3-031-73254-6_9"},{"key":"9_CR56","unstructured":"Wang, Y., et\u00a0al.: Internvideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"9_CR57","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: Simvlm: simple visual language model pretraining with weak supervision. In: International Conference on Learning Representations (2021)"},{"key":"9_CR58","doi-asserted-by":"crossref","unstructured":"Yao, Y., Xu, M., Choi, C., Crandall, D.J., Atkins, E.M., Dariush, B.: Egocentric vision-based future vehicle localization for intelligent driving assistance systems. In: 2019 International Conference on Robotics and Automation (ICRA), pp. 9711\u20139717. 
IEEE (2019)","DOI":"10.1109\/ICRA.2019.8794474"},{"key":"9_CR59","doi-asserted-by":"crossref","unstructured":"Ye, X., Iyer, S., Celikyilmaz, A., Stoyanov, V., Durrett, G., Pasunuru, R.: Complementary explanations for effective in-context learning. arXiv preprint arXiv:2211.13892 (2022)","DOI":"10.18653\/v1\/2023.findings-acl.273"},{"key":"9_CR60","unstructured":"Zeng, A., et\u00a0al.: Socratic models: composing zero-shot multimodal reasoning with language. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"9_CR61","doi-asserted-by":"crossref","unstructured":"Zhan, K., Faux, S., Ramos, F.: Multi-scale conditional random fields for first-person activity recognition. In: 2014 IEEE International Conference on Pervasive Computing and Communications (PerCom), pp. 51\u201359. IEEE (2014)","DOI":"10.1109\/PerCom.2014.6813944"},{"key":"9_CR62","unstructured":"Zhang, S., et\u00a0al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"9_CR63","unstructured":"Zhao, Q., et al.: AntGPT: can large language models help long-term action anticipation from videos? arXiv preprint arXiv:2307.16368 (2023)"},{"key":"9_CR64","unstructured":"Zhou, C., et\u00a0al.: A comprehensive survey on pretrained foundation models: a history from bert to chatgpt. arXiv preprint arXiv:2302.09419 (2023)"},{"key":"9_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 13041\u201313049 (2020)","DOI":"10.1609\/aaai.v34i07.7005"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73007-8_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T22:37:18Z","timestamp":1732833438000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73007-8_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031730061","9783031730078"],"references-count":65,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73007-8_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 
September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
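
A minimal sketch (not part of the record above) of how a record with this structure can be retrieved and summarized. It assumes the Python requests library is available and that the public Crossref REST API endpoint https://api.crossref.org/works/{DOI} returns the same "message" object shown above; the field accesses mirror the keys visible in this record (title, author, published, container-title, references-count).

    # Fetch the Crossref metadata for this chapter and print a one-line citation.
    import requests

    DOI = "10.1007/978-3-031-73007-8_9"  # DOI taken from the record above

    resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
    resp.raise_for_status()
    work = resp.json()["message"]  # same structure as the "message" object above

    # "title" and "container-title" are lists; "author" holds given/family names.
    title = work["title"][0]
    authors = ", ".join(
        f'{a.get("given", "")} {a["family"]}'.strip() for a in work.get("author", [])
    )
    year = work["published"]["date-parts"][0][0]
    container = work.get("container-title", [""])[-1]

    print(f"{authors}: {title}. {container} ({year})")
    print(f"References deposited: {work.get('references-count', 0)}")

Expected output for this record would be along the lines of "Sanghwan Kim, Daoji Huang, Yongqin Xian, Otmar Hilliges, Luc Van Gool, Xi Wang: PALM: Predicting Actions through Language Models. Computer Vision – ECCV 2024 (2024)" with 65 deposited references; exact formatting depends on the live API response.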