{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T16:06:26Z","timestamp":1775145986035,"version":"3.50.1"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733468","type":"print"},{"value":"9783031733475","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73347-5_17","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:15:43Z","timestamp":1730106943000},"page":"292-308","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":30,"title":["Reason2Drive: Towards Interpretable and\u00a0Chain-Based Reasoning for\u00a0Autonomous Driving"],"prefix":"10.1007","author":[{"given":"Ming","family":"Nie","sequence":"first","affiliation":[]},{"given":"Renyuan","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Chunwei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xinyue","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Jianhua","family":"Han","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Li","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"17_CR1","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL workshop (2005)"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"17_CR4","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint (2023)"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., Wu, P., Chitta, K., Jaeger, B., Geiger, A., Li, H.: End-to-end autonomous driving: Challenges and frontiers. arXiv preprint (2023)","DOI":"10.1109\/TPAMI.2024.3435937"},{"key":"17_CR6","unstructured":"Chiang, W.L., et\u00a0al.: Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. See https:\/\/vicuna.lmsys.org. Accessed 14 Apr 2023 (2023)"},{"key":"17_CR7","unstructured":"Chowdhery, A., et\u00a0al.: PaLM: Scaling language modeling with pathways. arXiv preprint (2022)"},{"key":"17_CR8","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. 
arXiv preprint (2022)"},{"key":"17_CR9","unstructured":"Contributors, D.: DriveLM: Drive on language. https:\/\/github.com\/OpenDriveLab\/DriveLM (2023)"},{"key":"17_CR10","unstructured":"Dai, W., et al.: InstructBLIP: Towards general-purpose vision-language models with instruction tuning. arXiv preprint (2023)"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Deruyttere, T., Grujicic, D., Blaschko, M.B., Moens, M.F.: Talk2Car: predicting physical trajectories for natural language commands. IEEE Access 10, 123809\u2013123834 (2022)","DOI":"10.1109\/ACCESS.2022.3224144"},{"key":"17_CR12","unstructured":"Dewangan, V., et al.: Talk2Bev: Language-enhanced bird\u2019s-eye view maps for autonomous driving. arXiv preprint (2023)"},{"key":"17_CR13","unstructured":"Ding, X., Han, J., Xu, H., Zhang, W., Li, X.: HiLM-D: Towards high-resolution understanding in multimodal large language models for autonomous driving. arXiv preprint (2023)"},{"key":"17_CR14","unstructured":"Doll\u00e1r, K., Girshick, R.: Mask R-CNN. In: ICCV (2017)"},{"key":"17_CR15","unstructured":"Feng, Q., Ablavsky, V., Sclaroff, S.: CityFlow-NL: tracking and retrieval of vehicles at city scale by natural language descriptions. arXiv preprint (2021)"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Fu, D., et al.: Drive like a human: Rethinking autonomous driving with large language models. arXiv preprint (2023)","DOI":"10.1109\/WACVW60836.2024.00102"},{"key":"17_CR17","unstructured":"Golovneva, O., et al.: ROSCOE: A suite of metrics for scoring step-by-step reasoning. arXiv preprint (2022)"},{"key":"17_CR18","unstructured":"Han, J., et\u00a0al.: ImageBIND-LLM: Multi-modality instruction tuning. arXiv preprint (2023)"},{"key":"17_CR19","unstructured":"Hu, E.J., et al.: LoRA: Low-rank adaptation of large language models. arXiv preprint (2021)"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Hu, Y., et\u00a0al.: Planning-oriented autonomous driving. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Jain, K., Chhangani, V., Tiwari, A., Krishna, K.M., Gandhi, V.: Ground then navigate: language-guided navigation in dynamic scenes. In: ICRA (2023)","DOI":"10.1109\/ICRA48891.2023.10160614"},{"key":"17_CR22","unstructured":"Jia, F., et al.: ADriver-I: A general world model for autonomous driving. arXiv preprint (2023)"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: Reasoning segmentation via large language model. arXiv preprint (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"17_CR24","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint (2023)"},{"key":"17_CR25","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NIPS (2023)"},{"key":"17_CR26","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2017)"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Malla, S., Choi, C., Dwivedi, I., Choi, J.H., Li, J.: DRAMA: joint risk localization and captioning in driving. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2023)","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"17_CR28","unstructured":"Mao, J., et\u00a0al.: One million scenes for autonomous driving: Once dataset. 
arXiv preprint (2021)"},{"key":"17_CR29","unstructured":"Mao, J., Qian, Y., Zhao, H., Wang, Y.: GPT-driver: Learning to drive with GPT. arXiv preprint (2023)"},{"key":"17_CR30","unstructured":"OpenAI: GPT-4: A large-scale transformer-based language model (2023). https:\/\/www.openai.com\/research\/gpt-4, https:\/\/www.openai.com\/research\/gpt-4"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"17_CR32","unstructured":"Peng, B., Li, C., He, P., Galley, M., Gao, J.: Instruction tuning with GPT-4. arXiv preprint (2023)"},{"key":"17_CR33","unstructured":"Peng, Z., et al.: Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint (2023)"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Pi, R., et al.: DetGPT: Detect what you need via reasoning. arXiv preprint (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.876"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Qian, T., Chen, J., Zhuo, L., Jiao, Y., Jiang, Y.G.: NuScenes-QA: A multi-modal visual question answering benchmark for autonomous driving scenario. arXiv preprint (2023)","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Sachdeva, E., et al.: Rank2Tell: A multimodal driving dataset for joint importance ranking and reasoning. arXiv preprint (2023)","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Sun, P., et\u00a0al.: Scalability in perception for autonomous driving: Waymo open dataset. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"17_CR38","unstructured":"Touvron, H., et\u00a0al.: Llama: Open and efficient foundation language models. arXiv preprint (2023)"},{"key":"17_CR39","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Wu, D., Han, W., Wang, T., Dong, X., Zhang, X., Shen, J.: Referring multi-object tracking. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"17_CR41","unstructured":"Wu, D., Han, W., Wang, T., Liu, Y., Zhang, X., Shen, J.: Language prompt for autonomous driving. arXiv preprint (2023)"},{"key":"17_CR42","doi-asserted-by":"crossref","unstructured":"Xu, Z., et al.: DriveGPT4: Interpretable end-to-end autonomous driving via large language model. arXiv preprint (2023)","DOI":"10.1109\/LRA.2024.3440097"},{"key":"17_CR43","unstructured":"Yu, P., et al.: ALERT: Adapting language models to reasoning tasks. arXiv preprint (2022)"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: An instruction-tuned audio-visual language model for video understanding. arXiv preprint (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"17_CR45","unstructured":"Zhang, S., et\u00a0al.: OPT: Open pre-trained transformer language models. arXiv preprint (2022)"},{"key":"17_CR46","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models. 
arXiv preprint (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73347-5_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:49:02Z","timestamp":1730108942000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73347-5_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031733468","9783031733475"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73347-5_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}