{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T15:48:44Z","timestamp":1778860124860,"version":"3.51.4"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729799","type":"print"},{"value":"9783031729805","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72980-5_15","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:15:43Z","timestamp":1730106943000},"page":"252-269","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":63,"title":["LingoQA: Visual Question Answering for\u00a0Autonomous Driving"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6494-6018","authenticated-orcid":false,"given":"Ana-Maria","family":"Marcu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4985-9516","authenticated-orcid":false,"given":"Long","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jan","family":"H\u00fcnermann","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alice","family":"Karnsund","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benoit","family":"Hanotte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Prajwal","family":"Chidananda","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Saurabh","family":"Nair","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vijay","family":"Badrinarayanan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alex","family":"Kendall","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jamie","family":"Shotton","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elahe","family":"Arani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Oleg","family":"Sinavski","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"15_CR1","unstructured":"Partners for automated vehicle education. Pave poll 2020. https:\/\/pavecampaign.org\/pave-poll-americans-wary-of-avs-but-say-education-and-experience-with-technology-can-build-trust\/. Accessed 12 Oct 2023"},{"key":"15_CR2","unstructured":"What\u2019s going on with the open LLM leaderboard? https:\/\/huggingface.co\/blog\/evaluating-mmlu-leaderboard. Accessed 22 Oct 2023"},{"key":"15_CR3","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (2022)"},{"key":"15_CR4","unstructured":"Arrieta, A.B., et al.: Explainable artificial intelligence (XAI): concepts, taxonomies, opportunities and challenges toward responsible (2019)"},{"key":"15_CR5","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372. Association for Computational Linguistics, Ann Arbor (2005). https:\/\/aclanthology.org\/W05-0909"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Bansal, M., Krizhevsky, A., Ogale, A.: ChauffeurNet: learning to drive by imitating the best and synthesizing the worst. arXiv preprint arXiv:1812.03079 (2018)","DOI":"10.15607\/RSS.2019.XV.031"},{"key":"15_CR7","unstructured":"Bao, H., et al.: VLMo: unified vision-language pre-training with mixture-of-modality-experts. In: Advances in Neural Information Processing Systems (2022). https:\/\/openreview.net\/forum?id=bydKs84JEyw"},{"key":"15_CR8","unstructured":"Brohan, A., et al.: RT-2: vision-language-action models transfer web knowledge to robotic control (2023)"},{"key":"15_CR9","unstructured":"Brohan, A., et al.: RT-1: robotics transformer for real-world control at scale (2023)"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Chen, L., Wu, P., Chitta, K., Jaeger, B., Geiger, A., Li, H.: End-to-end autonomous driving: challenges and frontiers (2023)","DOI":"10.1109\/TPAMI.2024.3435937"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Driving with LLMs: fusing object-level vector modality for explainable autonomous driving (2023)","DOI":"10.1109\/ICRA57147.2024.10611018"},{"key":"15_CR12","unstructured":"Chen, X., et al.: PaLI: a jointly-scaled multilingual language-image model. In: International Conference on Learning Representation (2023)"},{"key":"15_CR13","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Chib, P.S., Singh, P.: Recent advancements in end-to-end autonomous driving using deep learning: a survey (2023)","DOI":"10.1109\/TIV.2023.3318070"},{"key":"15_CR15","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Deruyttere, T., Vandenhende, S., Grujicic, D., Van\u00a0Gool, L., Moens, M.F.: Talk2Car: taking control of your self-driving car. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 2088\u20132098 (2019)","DOI":"10.18653\/v1\/D19-1215"},{"key":"15_CR17","unstructured":"Driess, D., et al.: PaLM-E: an embodied multimodal language model (2023)"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Gao, J., et al.: VectorNet: encoding HD maps and agent dynamics from vectorized representation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11525\u201311533 (2020)","DOI":"10.1109\/CVPR42600.2020.01154"},{"key":"15_CR19","unstructured":"Hawke, J., Haibo, E., Badrinarayanan, V., Kendall, A.: Reimagining an autonomous vehicle (2021)"},{"key":"15_CR20","unstructured":"He, P., Gao, J., Chen, W.: DeBERTaV3: improving DeBERTa using ELECTRA-style pre-training with gradient-disentangled embedding sharing (2023)"},{"key":"15_CR21","unstructured":"Hu, A., et al.: Model-based imitation learning for urban driving. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Advances in Neural Information Processing Systems, vol.\u00a035, pp. 20703\u201320716. Curran Associates, Inc. (2022). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/827cb489449ea216e4a257c47e407d18-Paper-Conference.pdf"},{"key":"15_CR22","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models (2021)"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"15_CR24","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: OpenCLIP (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"15_CR25","unstructured":"Jain, S., Wallace, B.C.: Attention is not explanation. arXiv preprint arXiv:1902.10186 (2019)"},{"key":"15_CR26","unstructured":"Jiang, A.Q., et al.: Mistral 7B (2023)"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Jin, B., et al.: ADAPT: action-aware driving caption transformer (2023)","DOI":"10.1109\/ICRA48891.2023.10160326"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Kim, J., Canny, J.: Interpretable learning for self-driving cars by visualizing causal attention (2017)","DOI":"10.1109\/ICCV.2017.320"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Kim, J., Rohrbach, A., Darrell, T., Canny, J., Akata, Z.: Textual explanations for self-driving vehicles (2018)","DOI":"10.1007\/978-3-030-01216-8_35"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Li, J., Niu, L., Zhang, L.: From representation to reasoning: towards both evidence and commonsense reasoning for video question-answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"15_CR31","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models (2023)"},{"key":"15_CR32","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381. Association for Computational Linguistics, Barcelona (2004). https:\/\/aclanthology.org\/W04-1013"},{"key":"15_CR33","doi-asserted-by":"crossref","unstructured":"Lin, S., Hilton, J., Evans, O.: TruthfulQA: measuring how models mimic human falsehoods (2022)","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"15_CR34","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023)"},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., Zhu, C.: G-Eval: NLG evaluation using GPT-4 with better human alignment (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"15_CR36","unstructured":"Mao, J., Qian, Y., Zhao, H., Wang, Y.: GPT-Driver: learning to drive with GPT. arXiv preprint arXiv:2310.01415 (2023)"},{"key":"15_CR37","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"15_CR38","doi-asserted-by":"publisher","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318. Association for Computational Linguistics, Philadelphia (2002). https:\/\/doi.org\/10.3115\/1073083.1073135. https:\/\/aclanthology.org\/P02-1040","DOI":"10.3115\/1073083.1073135"},{"key":"15_CR39","unstructured":"The Language Archive: ELAN. Max Planck Institute for Psycholinguistics, Nijmegen (2023). https:\/\/archive.mpi.nl\/tla\/elan"},{"key":"15_CR40","unstructured":"P\u0103tr\u0103ucean, V., et al.: Perception test: a diagnostic benchmark for multimodal video models. In: Advances in Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=HYEGXFnPoq"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Qian, T., Chen, J., Zhuo, L., Jiao, Y., Jiang, Y.G.: NuScenes-QA: a multi-modal visual question answering benchmark for autonomous driving scenario. arXiv preprint arXiv:2305.14836 (2023)","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"15_CR42","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"15_CR43","doi-asserted-by":"crossref","unstructured":"Sachdeva, E., et al.: Rank2Tell: a multimodal driving dataset for joint importance ranking and reasoning (2023)","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"15_CR44","unstructured":"Sha, H., et al.: LanguageMPC: large language models as decision makers for autonomous driving (2023)"},{"key":"15_CR45","doi-asserted-by":"crossref","unstructured":"Sima, C., et al.: DriveLM: driving with graph visual question answering. arXiv preprint arXiv:2312.14150 (2023)","DOI":"10.1007\/978-3-031-72943-0_15"},{"key":"15_CR46","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models (2023)"},{"key":"15_CR47","unstructured":"Touvron, H., et al.: LLaMA 2: open foundation and fine-tuned chat models (2023)"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Image as a foreign language: BEiT pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"15_CR50","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, C.: SimVLM: simple visual language model pretraining with weak supervision. In: International Conference on Learning Representation (2022)"},{"key":"15_CR51","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models (2023)"},{"key":"15_CR52","unstructured":"Wen, L., et al.: On the road with GPT-4V(ision): early explorations of visual-language model on autonomous driving (2023)"},{"issue":"1","key":"15_CR53","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1145\/3434580","volume":"28","author":"W Xu","year":"2020","unstructured":"Xu, W.: From automation to autonomy and autonomous vehicles: challenges and opportunities for human-computer interaction. Interactions 28(1), 48\u201353 (2020). https:\/\/doi.org\/10.1145\/3434580","journal-title":"Interactions"},{"key":"15_CR54","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Explainable object-induced action decision for autonomous vehicles (2020)","DOI":"10.1109\/CVPR42600.2020.00954"},{"key":"15_CR55","doi-asserted-by":"crossref","unstructured":"Xu, Z., et al.: DriveGPT4: interpretable end-to-end autonomous driving via large language model (2023)","DOI":"10.1109\/LRA.2024.3440097"},{"key":"15_CR56","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Unified contrastive learning in image-text-label space (2022)","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"15_CR57","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models (2022)"},{"key":"15_CR58","unstructured":"Zhang, H., et al.: GLIPv2: unifying localization and vision-language understanding. In: Advances in Neural Information Processing Systems (2022)"},{"key":"15_CR59","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models (2022)"},{"key":"15_CR60","unstructured":"Zhao, B., Wu, B., Huang, T.: SVIT: scaling up visual instruction tuning (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72980-5_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T10:21:39Z","timestamp":1732962099000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72980-5_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031729799","9783031729805"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72980-5_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}