{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T18:40:22Z","timestamp":1780080022317,"version":"3.54.0"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733369","type":"print"},{"value":"9783031733376","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73337-6_11","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:02:27Z","timestamp":1730329347000},"page":"185-201","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["PreLAR: World Model Pre-training with\u00a0Learnable Action Representation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9230-9879","authenticated-orcid":false,"given":"Lixuan","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9483-875X","authenticated-orcid":false,"given":"Meina","family":"Kan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8348-392X","authenticated-orcid":false,"given":"Shiguang","family":"Shan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3024-4404","authenticated-orcid":false,"given":"Xilin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"11_CR1","unstructured":"Agarwal, R., Schwarzer, M., Castro, P.S., Courville, A.C., Bellemare, M.: Deep reinforcement learning at the edge of the statistical precipice. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a034, pp. 29304\u201329320 (2021)"},{"key":"11_CR2","unstructured":"Alemi, A.A., Fischer, I., Dillon, J.V., Murphy, K.: Deep variational information bottleneck. In: International Conference on Learning Representations (ICLR) (2017)"},{"issue":"47","key":"11_CR3","doi-asserted-by":"publisher","first-page":"29302","DOI":"10.1073\/pnas.1912341117","volume":"117","author":"KR Allen","year":"2020","unstructured":"Allen, K.R., Smith, K.A., Tenenbaum, J.B.: Rapid trial-and-error learning with simulation supports flexible tool use and physical reasoning. Proc. Natl. Acad. Sci. (PNAS) 117(47), 29302\u201329310 (2020)","journal-title":"Proc. Natl. Acad. Sci. (PNAS)"},{"key":"11_CR4","unstructured":"Baker, B., et al.: Video PreTraining (VPT): learning to act by watching unlabeled online videos. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Bingham, E., Mannila, H.: Random projection in dimensionality reduction: applications to image and text data. In: ACM International Conference on Knowledge Discovery and Data Mining (KDD), pp. 245\u2013250 (2001)","DOI":"10.1145\/502512.502546"},{"key":"11_CR6","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"11_CR7","unstructured":"Daniel, K.: Thinking, Fast and Slow. Macmillan (2011)"},{"key":"11_CR8","unstructured":"Deng, F., Park, J., Ahn, S.: Facing off world model backbones: RNNs, transformers, and S4. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a036, pp. 72904\u201372930 (2023)"},{"key":"11_CR9","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\,\\times \\,$$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"11_CR10","unstructured":"Du, Y., et al.: Video language planning. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The something something video database for learning and evaluating visual common sense. In: IEEE International Conference on Computer Vision (ICCV), pp. 5843\u20135851 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"11_CR12","unstructured":"Ha, D., Schmidhuber, J.: Recurrent world models facilitate policy evolution. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a031 (2018)"},{"key":"11_CR13","unstructured":"Ha, D., Schmidhuber, J.: World models. arXiv preprint arXiv:1803.10122 (2018)"},{"key":"11_CR14","unstructured":"Hafner, D., Lillicrap, T., Ba, J., Norouzi, M.: Dream to control: learning behaviors by latent imagination. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"11_CR15","unstructured":"Hafner, D., et al.: Learning latent dynamics for planning from pixels. In: International Conference on Machine Learning (ICML), pp. 2555\u20132565 (2019)"},{"key":"11_CR16","unstructured":"Hafner, D., Lillicrap, T.P., Norouzi, M., Ba, J.: Mastering Atari with discrete world models. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"11_CR17","unstructured":"Hafner, D., Pasukonis, J., Ba, J., Lillicrap, T.: Mastering diverse domains through world models. arXiv preprint arXiv:2301.04104 (2023)"},{"issue":"2","key":"11_CR18","doi-asserted-by":"publisher","first-page":"3019","DOI":"10.1109\/LRA.2020.2974707","volume":"5","author":"S James","year":"2020","unstructured":"James, S., Ma, Z., Arrojo, D.R., Davison, A.J.: RLBench: the robot learning benchmark & learning environment. IEEE Robot. Autom. Lett. (RAL) 5(2), 3019\u20133026 (2020)","journal-title":"IEEE Robot. Autom. Lett. (RAL)"},{"key":"11_CR19","unstructured":"Jia, F., et al.: ADriver-I: a general world model for autonomous driving. arXiv preprint arXiv:2311.13549 (2023)"},{"key":"11_CR20","unstructured":"Kaiser, \u0141., et al.: Model based reinforcement learning for Atari. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"11_CR21","unstructured":"Ko, P.C., Mao, J., Du, Y., Sun, S.H., Tenenbaum, J.B.: Learning to act from actionless videos through dense correspondences. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"11_CR22","unstructured":"LeCun, Y.: A path towards autonomous machine intelligence. Open Rev. 62 (2022)"},{"key":"11_CR23","unstructured":"Lu, C., et al.: Structured state space models for in-context reinforcement learning. In: ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems (2023)"},{"key":"11_CR24","unstructured":"Mattes, P., Schlosser, R., Herbrich, R.: Hieros: hierarchical imagination on structured state space sequence world models. arXiv preprint arXiv:2310.05167 (2023)"},{"key":"11_CR25","unstructured":"Micheli, V., Alonso, E., Fleuret, F.: Transformers are sample-efficient world models. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Moerland, T.M., Broekens, J., Plaat, A., Jonker, C.M., et\u00a0al.: Model-based reinforcement learning: a survey. Found. Trends\u00ae Mach. Learn. 16(1), 1\u2013118 (2023)","DOI":"10.1561\/2200000086"},{"key":"11_CR27","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018). https:\/\/www.mikecaptain.com\/resources\/pdf\/GPT-1.pdf"},{"key":"11_CR28","unstructured":"Robine, J., H\u00f6ftmann, M., Uelwer, T., Harmeling, S.: Transformer-based world models are happy with 100k interactions. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"11_CR30","unstructured":"Seo, Y., Lee, K., James, S.L., Abbeel, P.: Reinforcement learning with action-free pre-training from videos. In: International Conference on Machine Learning (ICML), pp. 19561\u201319579 (2022)"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, Z., Huang, G., Chen, X., Lu, J.: DriveDreamer: towards real-world-driven world models for autonomous driving. arXiv preprint arXiv:2309.09777 (2023)","DOI":"10.1007\/978-3-031-73195-2_4"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Y., He, J., Fan, L., Li, H., Chen, Y., Zhang, Z.: Driving into the future: multiview visual forecasting and planning with world model for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14749\u201314759 (2024)","DOI":"10.1109\/CVPR52733.2024.01397"},{"key":"11_CR33","unstructured":"Wu, J., Ma, H., Deng, C., Long, M.: Pre-training contextualized world models with in-the-wild videos for reinforcement learning. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a036, pp. 39719\u201339743 (2023)"},{"key":"11_CR34","doi-asserted-by":"crossref","unstructured":"Xu, H., Zhang, J., Cai, J., Rezatofighi, H., Tao, D.: GMFlow: learning optical flow via global matching. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8121\u20138130 (2022)","DOI":"10.1109\/CVPR52688.2022.00795"},{"key":"11_CR35","unstructured":"Yu, T., et al.: Meta-World: a benchmark and evaluation for multi-task and meta reinforcement learning. In: Conference on Robot Learning (CoRL) (2019)"},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Peng, Z., Zhou, B.: Learning to drive by watching Youtube videos: action-conditioned contrastive policy pretraining. In: European Conference on Computer Vision (ECCV), pp. 111\u2013128 (2022)","DOI":"10.1007\/978-3-031-19809-0_7"},{"key":"11_CR37","unstructured":"Zhang, W., Wang, G., Sun, J., Yuan, Y., Huang, G.: STORM: efficient stochastic transformer based world models for reinforcement learning. In: Advances in Neural Information Processing Systems (NeurIPS) (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73337-6_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T15:00:21Z","timestamp":1732978821000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73337-6_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733369","9783031733376"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73337-6_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}