{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:18:15Z","timestamp":1778080695740,"version":"3.51.4"},"publisher-location":"Cham","reference-count":99,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730320","type":"print"},{"value":"9783031730337","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73033-7_8","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:03:55Z","timestamp":1730333035000},"page":"129-148","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Embodied Understanding of\u00a0Driving Scenarios"],"prefix":"10.1007","author":[{"given":"Yunsong","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Linyan","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingwen","family":"Bu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia","family":"Zeng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianyu","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hang","family":"Qiu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongzi","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongyang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"issue":"6","key":"8_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3355390","volume":"52","author":"N Aafaq","year":"2019","unstructured":"Aafaq, N., Mian, A., Liu, W., Gilani, S.Z., Shah, M.: Video description: a survey of methods, datasets, and evaluation metrics. ACM Comput. Surv. (CSUR) 52(6), 1\u201337 (2019)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"8_CR2","unstructured":"Abu-El-Haija, S., et al.: Youtube-8m: a large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 (2016)"},{"key":"8_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR4","unstructured":"Brohan, A., et\u00a0al.: RT-2: Vision-language-action models transfer web knowledge to robotic control. arXiv preprint arXiv:2307.15818 (2023)"},{"key":"8_CR5","unstructured":"Brohan, A., et\u00a0al.: Rt-1: robotics transformer for real-world control at scale. arXiv preprint arXiv:2212.06817 (2022)"},{"key":"8_CR6","unstructured":"Brown, T.B., et\u00a0al.: Language models are few-shot learners (2020)"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: Activitynet: a large-scale video benchmark for human activity understanding (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Casas, S., Sadat, A., Urtasun, R.: Mp3: a unified model to map, perceive, predict and plan. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14403\u201314412 (2021)","DOI":"10.1109\/CVPR46437.2021.01417"},{"key":"8_CR10","doi-asserted-by":"crossref","unstructured":"Chen, G., et al.: Tem-adapter: adapting image-text pretraining for video question answer (2023)","DOI":"10.1109\/ICCV51070.2023.01282"},{"key":"8_CR11","unstructured":"Chen, L., et al.: Language models are visual reasoning coordinators. In: ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models (2023)"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Driving with LLMS: fusing object-level vector modality for explainable autonomous driving. arXiv preprint arXiv:2310.01957 (2023)","DOI":"10.1109\/ICRA57147.2024.10611018"},{"key":"8_CR13","unstructured":"Chu, X., et\u00a0al.: Mobilevlm: a fast, reproducible and strong vision language assistant for mobile devices. arXiv preprint arXiv:2312.16886 (2023)"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"Chung, J.J.Y., Kamar, E., Amershi, S.: Increasing diversity while maintaining accuracy: text data generation with large language models and human interventions. arXiv preprint arXiv:2306.04140 (2023)","DOI":"10.18653\/v1\/2023.acl-long.34"},{"key":"8_CR15","unstructured":"Dauner, D., Hallgarten, M., Geiger, A., Chitta, K.: Parting with misconceptions about learning-based vehicle motion planning. arXiv preprint arXiv:2306.07962 (2023)"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Deruyttere, T., Grujicic, D., Blaschko, M.B., Moens, M.F.: Talk2Car: predicting physical trajectories for natural language commands. IEEE Access (2022)","DOI":"10.1109\/ACCESS.2022.3224144"},{"key":"8_CR17","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"8_CR18","unstructured":"Dewangan, V., et al.: Talk2BEV: language-enhanced bird\u2019s-eye view maps for autonomous driving. arXiv preprint arXiv:2310.02251 (2023)"},{"key":"8_CR19","unstructured":"Ding, X., Han, J., Xu, H., Zhang, W., Li, X.: HiLM-D: towards high-resolution understanding in multimodal large language models for autonomous driving. arXiv preprint arXiv:2309.05186 (2023)"},{"key":"8_CR20","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"8_CR21","unstructured":"Dosovitskiy, A., Ros, G., Codevilla, F., Lopez, A., Koltun, V.: CARLA: an open urban driving simulator. In: Proceedings of the 1st Annual Conference on Robot Learning, pp. 1\u201316 (2017)"},{"key":"8_CR22","unstructured":"Driess, D., et\u00a0al.: PaLM-E: an embodied multimodal language model (2023)"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Echterhoff, J., Yan, A., Han, K., Abdelraouf, A., Gupta, R., McAuley, J.: Driving through the concept gridlock: unraveling explainability bottlenecks. arXiv preprint arXiv:2310.16639 (2023)","DOI":"10.1109\/WACV57701.2024.00718"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Elhafsi, A., Sinha, R., Agia, C., Schmerling, E., Nesnas, I., Pavone, M.: Semantic anomaly detection with large language models (2023)","DOI":"10.1007\/s10514-023-10132-6"},{"key":"8_CR25","unstructured":"Fan, H., et al.: Baidu Apollo EM motion planner. arXiv preprint arXiv:1807.08048 (2018)"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: Eva: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"8_CR27","unstructured":"Gao, P., et\u00a0al.: LLaMA-Adapter v2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"8_CR28","unstructured":"Grauman, K., et\u00a0al.: Ego4d: around the world in 3,000 hours of egocentric video (2022)"},{"key":"8_CR29","unstructured":"Gu, J., et\u00a0al.: Robotic task generalization via hindsight trajectory sketches. In: First Workshop on Out-of-Distribution Generalization in Robotics at CoRL 2023 (2023)"},{"key":"8_CR30","unstructured":"Hao, Y., et al.: Language models are general-purpose interfaces. arXiv preprint arXiv:2206.06336 (2022)"},{"key":"8_CR31","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Hu, P., Huang, A., Dolan, J., Held, D., Ramanan, D.: Safe local motion planning with self-supervised freespace forecasting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12732\u201312741 (2021)","DOI":"10.1109\/CVPR46437.2021.01254"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Hu, S., Chen, L., Wu, P., Li, H., Yan, J., Tao, D.: St-p3: end-to-end vision-based autonomous driving via spatial-temporal feature learning (2022)","DOI":"10.1007\/978-3-031-19839-7_31"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Hu, Y., et\u00a0al.: Planning-oriented autonomous driving (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"8_CR35","unstructured":"Huang, L., et al.: Leveraging vision-centric multi-modal expertise for 3D object detection. arXiv preprint arXiv:2310.15670 (2023)"},{"key":"8_CR36","unstructured":"Huang, S., et al., et\u00a0al.: Language is not all you need: aligning perception with language models. arXiv preprint arXiv:2302.14045 (2023)"},{"key":"8_CR37","doi-asserted-by":"crossref","unstructured":"Jin, B., et al.: Adapt: action-aware driving caption transformer (2023)","DOI":"10.1109\/ICRA48891.2023.10160326"},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"Karamcheti, S., et al.: Language-Driven representation learning for robotics (2023)","DOI":"10.15607\/RSS.2023.XIX.032"},{"key":"8_CR39","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"8_CR40","unstructured":"Keysan, A., et al.: Can you text what is happening? integrating pre-trained language encoders into trajectory prediction models for autonomous driving. arXiv preprint arXiv:2309.05282 (2023)"},{"key":"8_CR41","series-title":"LNCS","first-page":"353","volume-title":"ECCV 2022","author":"T Khurana","year":"2022","unstructured":"Khurana, T., Hu, P., Dave, A., Ziglar, J., Held, D., Ramanan, D.: Differentiable raycasting for self-supervised occupancy forecasting. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13698, pp. 353\u2013369. Springer, Cham (2022)"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Kim, J., Misu, T., Chen, Y.T., Tawari, A., Canny, J.: Grounding human-to-vehicle advice for self-driving vehicles (2019)","DOI":"10.1109\/CVPR.2019.01084"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Kim, J., Rohrbach, A., Darrell, T., Canny, J., Akata, Z.: Textual explanations for self-driving vehicles (2018)","DOI":"10.1007\/978-3-030-01216-8_35"},{"issue":"1\u20132","key":"8_CR44","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"HW Kuhn","year":"1955","unstructured":"Kuhn, H.W.: The Hungarian method for the assignment problem. Naval Res. Logist. Quart. 2(1\u20132), 83\u201397 (1955)","journal-title":"Naval Res. Logist. Quart."},{"key":"8_CR45","unstructured":"LeCun, Y.: A path towards autonomous machine intelligence version 0.9. 2, 2022-06-27. Open Review 62 (2022)"},{"key":"8_CR46","unstructured":"Li, B., et al.: MIMIC-IT: multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425 (2023)"},{"key":"8_CR47","doi-asserted-by":"publisher","unstructured":"Li, H., et al.: Open-sourced data ecosystem in autonomous driving: the present and future (2023). https:\/\/doi.org\/10.13140\/RG.2.2.10945.74088","DOI":"10.13140\/RG.2.2.10945.74088"},{"key":"8_CR48","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models (2023)"},{"key":"8_CR49","unstructured":"Li, K., et al.: Videochat: chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"8_CR50","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-20077-9_1","volume-title":"ECCV 2022","author":"Z Li","year":"2022","unstructured":"Li, Z., et al.: BEVFormer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 1\u201318. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_1"},{"key":"8_CR51","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"8_CR52","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"8_CR53","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023)"},{"key":"8_CR54","first-page":"11525","volume":"33","author":"F Locatello","year":"2020","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. Adv. Neural. Inf. Process. Syst. 33, 11525\u201311538 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR55","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"8_CR56","unstructured":"Lu, P., et al.: Learn to explain: multimodal reasoning via thought chains for science question answering (2022)"},{"key":"8_CR57","unstructured":"Majumdar, A., et\u00a0al.: Where are we in the search for an artificial visual cortex for embodied intelligence? arXiv preprint arXiv:2303.18240 (2023)"},{"key":"8_CR58","doi-asserted-by":"crossref","unstructured":"Malla, S., Choi, C., Dwivedi, I., Choi, J.H., Li, J.: DRAMA: joint risk localization and captioning in driving (2023)","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"8_CR59","unstructured":"Mao, J., Qian, Y., Zhao, H., Wang, Y.: GPT-driver: learning to drive with GPT. arXiv preprint arXiv:2310.01415 (2023)"},{"key":"8_CR60","unstructured":"Mu, Y., et al.: Embodiedgpt: vision-language pre-training via embodied chain of thought. arXiv preprint arXiv:2305.15021 (2023)"},{"key":"8_CR61","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, K., Khan, F.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"8_CR62","unstructured":"OpenAI, R.: Dall.e 3 system card (2023)"},{"key":"8_CR63","unstructured":"OpenAI, R.: GPT-4 technical report. arXiv pp. 2303\u201308774 (2023)"},{"key":"8_CR64","unstructured":"OpenAI, R.: GPT-4v(ision) system card (2023)"},{"key":"8_CR65","unstructured":"Padalkar, A., et\u00a0al.: Open x-embodiment: Robotic learning datasets and rt-x models. arXiv preprint arXiv:2310.08864 (2023)"},{"key":"8_CR66","unstructured":"Palo, N.D., Byravan, A., Hasenclever, L., Wulfmeier, M., Heess, N., Riedmiller, M.: Towards a unified agent with foundation models. arXiv preprint arXiv:2307.09668 (2023)"},{"key":"8_CR67","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"8_CR68","doi-asserted-by":"crossref","unstructured":"Qian, T., Chen, J., Zhuo, L., Jiao, Y., Jiang, Y.G.: NuScenes-QA: a multi-modal visual question answering benchmark for autonomous driving scenario. arXiv preprint arXiv:2305.14836 (2023)","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"8_CR69","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer (2020)"},{"key":"8_CR70","unstructured":"Regulation, G.D.P.: Art. 22 GDPR. automated individual decision-making, including profiling. Intersoft Consulting (2020)"},{"key":"8_CR71","doi-asserted-by":"crossref","unstructured":"Sachdeva, E., et al.: Rank2Tell: a multimodal driving dataset for joint importance ranking and reasoning. arXiv preprint arXiv:2309.06597 (2023)","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"8_CR72","unstructured":"Sauer, A., Savinov, N., Geiger, A.: Conditional affordance learning for driving in urban environments. In: Conference on Robot Learning, pp. 237\u2013252. PMLR (2018)"},{"key":"8_CR73","doi-asserted-by":"crossref","unstructured":"Seff, A., et al.: MotionLM: multi-agent motion forecasting as language modeling (2023)","DOI":"10.1109\/ICCV51070.2023.00788"},{"key":"8_CR74","unstructured":"Sha, H., et al.: LanguageMPC: large language models as decision makers for autonomous driving. arXiv preprint arXiv:2310.03026 (2023)"},{"key":"8_CR75","unstructured":"Shah, D., et al.: VINT: a foundation model for visual navigation. arXiv preprint arXiv:2306.14846 (2023)"},{"key":"8_CR76","doi-asserted-by":"crossref","unstructured":"Sima, C., et al.: DriveLM: driving with graph visual question answering. arXiv preprint arXiv:2312.14150 (2023)","DOI":"10.1007\/978-3-031-72943-0_15"},{"key":"8_CR77","doi-asserted-by":"crossref","unstructured":"Song, E., et\u00a0al.: MovieChat: from dense token to sparse memory for long video understanding. arXiv preprint arXiv:2307.16449 (2023)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"8_CR78","doi-asserted-by":"crossref","unstructured":"Sun, P., et\u00a0al.: Scalability in perception for autonomous driving: WAYMO open dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2446\u20132454 (2020)","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"8_CR79","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"8_CR80","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"8_CR81","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"8_CR82","doi-asserted-by":"crossref","unstructured":"Voigt, P., Von\u00a0dem Bussche, A.: The EU general data protection regulation (GDPR). A Practical Guide, 1st Ed. 10(3152676), 10\u20135555 (2017)","DOI":"10.1007\/978-3-319-57959-7_1"},{"key":"8_CR83","unstructured":"Wang, H., et\u00a0al.: OpenLane-V2: A topology reasoning benchmark for unified 3D HD mapping (2023)"},{"key":"8_CR84","unstructured":"Wang, J., et al.: Git: a generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)"},{"key":"8_CR85","doi-asserted-by":"crossref","unstructured":"Wang, P., Huang, X., Cheng, X., Zhou, D., Geng, Q., Yang, R.: The apolloscape open dataset for autonomous driving and its application. IEEE Trans. Pattern Anal. Mach. Intell. (2019)","DOI":"10.1109\/TPAMI.2019.2926463"},{"key":"8_CR86","unstructured":"Wang, Y., Guizilini, V.C., Zhang, T., Wang, Y., Zhao, H., Solomon, J.: Detr3D: 3D object detection from multi-view images via 3d-to-2d queries. In: Conference on Robot Learning, pp. 180\u2013191. PMLR (2022)"},{"key":"8_CR87","unstructured":"Wayve: Lingo-1 (2023). https:\/\/wayve.ai\/thinking\/lingo-natural-language-autonomous-driving\/"},{"key":"8_CR88","doi-asserted-by":"crossref","unstructured":"Wu, D., Han, W., Wang, T., Dong, X., Zhang, X., Shen, J.: Referring Multi-Object tracking (2023)","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"8_CR89","unstructured":"Wu, D., Han, W., Wang, T., Liu, Y., Zhang, X., Shen, J.: Language prompt for autonomous driving. arXiv preprint arXiv:2309.04379 (2023)"},{"key":"8_CR90","unstructured":"Xu, N., et al.: YouTube-VOS: a large-scale video object segmentation benchmark. arXiv preprint arXiv:1809.03327 (2018)"},{"key":"8_CR91","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Explainable object-induced action decision for autonomous vehicles (2020)","DOI":"10.1109\/CVPR42600.2020.00954"},{"key":"8_CR92","doi-asserted-by":"crossref","unstructured":"Xu, Z., et al.: DriveGPT4: interpretable end-to-end autonomous driving via large language model. arXiv preprint arXiv:2310.01412 (2023)","DOI":"10.1109\/LRA.2024.3440097"},{"key":"8_CR93","unstructured":"Yang, Z., Jia, X., Li, H., Yan, J.: A survey of large language models for autonomous driving (2023)"},{"key":"8_CR94","doi-asserted-by":"crossref","unstructured":"Zeng, W., et al.: End-to-end interpretable neural motion planner. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8660\u20138669 (2019)","DOI":"10.1109\/CVPR.2019.00886"},{"key":"8_CR95","unstructured":"Zhai, Y., et al.: Investigating the catastrophic forgetting in multimodal large language models. arXiv preprint arXiv:2309.10313 (2023)"},{"key":"8_CR96","unstructured":"Zhang, P., Zeng, G., Wang, T., Lu, W.: Tinyllama: an open-source small language model. arXiv preprint arXiv:2401.02385 (2024)"},{"key":"8_CR97","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1007\/978-3-031-19809-0_7","volume-title":"ECCV 2022","author":"Q Zhang","year":"2022","unstructured":"Zhang, Q., Peng, Z., Zhou, B.: Learning to drive by watching YouTube videos: Action-conditioned contrastive policy pretraining. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13686, pp. 111\u2013128. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19809-0_7"},{"key":"8_CR98","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"8_CR99","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"650","DOI":"10.1007\/978-3-031-20071-7_38","volume-title":"ECCV 2022","author":"H Zhu","year":"2022","unstructured":"Zhu, H., et al.: CelebV-HQ: a large-scale video facial attributes dataset. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13667, pp. 650\u2013667. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_38"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73033-7_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T15:06:38Z","timestamp":1732979198000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73033-7_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730320","9783031730337"],"references-count":99,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73033-7_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}