{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T20:59:26Z","timestamp":1780779566072,"version":"3.54.1"},"reference-count":49,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T00:00:00Z","timestamp":1778544000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Systems and Software"],"published-print":{"date-parts":[[2026,10]]},"DOI":"10.1016\/j.jss.2026.112915","type":"journal-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T15:02:06Z","timestamp":1777993326000},"page":"112915","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":3,"special_numbering":"C","title":["Benchmarking contextual understanding for in-car conversational systems"],"prefix":"10.1016","volume":"240","author":[{"given":"Philipp","family":"Habicht","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1162-6252","authenticated-orcid":false,"given":"Lev","family":"Sorokin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4185-3504","authenticated-orcid":false,"given":"Abdullah","family":"Saydemir","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ken","family":"Friedl","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8956-3894","authenticated-orcid":false,"given":"Andrea","family":"Stocco","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"4","key":"10.1016\/j.jss.2026.112915_b1","doi-asserted-by":"crossref","first-page":"555","DOI":"10.1162\/coli.07-034-R2","article-title":"Inter-coder agreement for computational linguistics","volume":"34","author":"Artstein","year":"2008","journal-title":"Comput. Linguist."},{"key":"10.1016\/j.jss.2026.112915_b2","series-title":"Proceedings of the 47th International Conference on Software Engineering","first-page":"12","article-title":"Efficient domain augmentation for autonomous driving testing using diffusion models","author":"Baresi","year":"2025"},{"key":"10.1016\/j.jss.2026.112915_b3","series-title":"Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks","author":"Chen","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b4","series-title":"ReConcile: Round-table conference improves reasoning via consensus among diverse LLMs","author":"Chen","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b5","series-title":"Unleashing the potential of prompt engineering in large language models: a comprehensive review","author":"Chen","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b6","series-title":"Can large language models be an alternative to human evaluations?","author":"Chiang","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b7","series-title":"Proceedings of the 48th International Conference on Software Engineering Workshops","article-title":"Large language models for secure code assessment: A multi-language empirical study","author":"Dozono","year":"2026"},{"key":"10.1016\/j.jss.2026.112915_b8","series-title":"Improving factuality and reasoning in language models through multiagent debate","author":"Du","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b9","series-title":"Proceedings of the 18th Annual SIGdial Meeting on Discourse and Dialogue","article-title":"Key-value retrieval networks for task-oriented dialogue","author":"Eric","year":"2017"},{"key":"10.1016\/j.jss.2026.112915_b10","series-title":"Mmdialog: A large-scale multi-turn dialogue dataset towards multi-modal open-domain conversation","author":"Feng","year":"2022"},{"key":"10.1016\/j.jss.2026.112915_b11","series-title":"Inca: Rethinking in-car conversational system assessment leveraging large language models","author":"Friedl","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b12","series-title":"Foundation models in autonomous driving: A survey on scenario generation and scenario analysis","author":"Gao","year":"2025"},{"key":"10.1016\/j.jss.2026.112915_b13","series-title":"Proceedings of the 36th IEEE Intelligent Vehicles Symposium","first-page":"8","article-title":"Automated factual benchmarking for in-car conversational systems using large language models","author":"Giebisch","year":"2025"},{"key":"10.1016\/j.jss.2026.112915_b14","series-title":"MORTAR: Metamorphic multi-turn testing for LLM-based dialogue systems","author":"Guo","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b15","series-title":"Evaluating large language models: A comprehensive survey","author":"Guo","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b16","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"7856","article-title":"Q2: Evaluating factual consistency in knowledge-grounded dialogues via question generation and question answering","author":"Honovich","year":"2021"},{"key":"10.1016\/j.jss.2026.112915_b17","series-title":"Human feedback is not gold standard","author":"Hosking","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b18","series-title":"Proceedings of 42nd International Conference on Software Engineering","first-page":"12","article-title":"Taxonomy of Real Faults in Deep Learning Systems","author":"Humbatova","year":"2020"},{"issue":"6","key":"10.1016\/j.jss.2026.112915_b19","doi-asserted-by":"crossref","first-page":"1274","DOI":"10.1093\/jcr\/ucx104","article-title":"Automated text analysis for consumer research","volume":"44","author":"Humphreys","year":"2018","journal-title":"J. Consum. Res."},{"key":"10.1016\/j.jss.2026.112915_b20","series-title":"Content Analysis","author":"Krippendorff","year":"2019"},{"key":"10.1016\/j.jss.2026.112915_b21","series-title":"Improving multi-agent debate with sparse communication topology","author":"Li","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b22","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.jss.2026.112915_b23","series-title":"LLM-Eval: Unified multi-dimensional automatic evaluation for open-domain conversations with large language models","author":"Lin","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b24","series-title":"Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing","author":"Liu","year":"2021"},{"key":"10.1016\/j.jss.2026.112915_b25","series-title":"Mapbox API","author":"Mapbox","year":"2025"},{"key":"10.1016\/j.jss.2026.112915_b26","series-title":"Proceedings of the 18th IEEE International Conference on Software Testing, Verification and Validation","first-page":"12","article-title":"Benchmarking generative AI models for deep learning test input generation","author":"Maryam","year":"2025"},{"key":"10.1016\/j.jss.2026.112915_b27","series-title":"Transforming Conversational AI","first-page":"XIII, 228","author":"McTear","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b28","series-title":"The Society of Mind","first-page":"339","author":"Minsky","year":"1986"},{"key":"10.1016\/j.jss.2026.112915_b29","series-title":"Why we need new evaluation metrics for NLG","author":"Novikova","year":"2017"},{"key":"10.1016\/j.jss.2026.112915_b30","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.jss.2026.112915_b31","series-title":"WDC products: A multi-dimensional entity matching benchmark","author":"Peeters","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b32","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","first-page":"7957","article-title":"Automatic prompt optimization with \u201cgradient descent\u201d and beam search","author":"Pryzant","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b33","doi-asserted-by":"crossref","first-page":"249","DOI":"10.1162\/tacl_a_00266","article-title":"Coqa: A conversational question answering challenge","volume":"7","author":"Reddy","year":"2019","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.jss.2026.112915_b34","doi-asserted-by":"crossref","first-page":"5193","DOI":"10.1007\/s10664-020-09881-0","article-title":"Testing machine learning based systems: a systematic mapping","volume":"25","author":"Riccio","year":"2020","journal-title":"Empir. Softw. Eng."},{"key":"10.1016\/j.jss.2026.112915_b35","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track","first-page":"586","article-title":"CarExpert: Leveraging large language models for in-car conversational question answering","author":"Rony","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b36","series-title":"CarExpert: Leveraging large language models for in-car conversational question answering","author":"Rony","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b37","series-title":"A systematic survey of prompt engineering in large language models: Techniques and applications","author":"Sahoo","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b38","series-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","first-page":"615","article-title":"\u201cAlexa in the wild\u201d \u2013 collecting unconstrained conversations with a modern voice assistant in a public environment","author":"Siegert","year":"2020"},{"key":"10.1016\/j.jss.2026.112915_b39","series-title":"Proceedings of the 33rd IEEE International Conference on Software Analysis, Evolution and Reengineering","article-title":"STELLAR: A search-based testing framework for large language model applications","author":"Sorokin","year":"2026"},{"key":"10.1016\/j.jss.2026.112915_b40","series-title":"Proceedings of the 48th International Conference on Software Engineering Workshops","article-title":"DeepTest tool competition 2026: Benchmarking an LLM-based automotive assistant","author":"Sorokin","year":"2026"},{"key":"10.1016\/j.jss.2026.112915_b41","series-title":"Just ask for calibration: Strategies for eliciting calibrated confidence scores from language models fine-tuned with human feedback","author":"Tian","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b42","series-title":"Information Retrieval","author":"van Rijsbergen","year":"1979"},{"key":"10.1016\/j.jss.2026.112915_b43","series-title":"Unleashing the emergent cognitive synergy in large language models: A task-solving agent through multi-persona self-collaboration","author":"Wang","year":"2024"},{"key":"10.1016\/j.jss.2026.112915_b44","series-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b45","series-title":"Chain-of-thought prompting elicits reasoning in large language models","author":"Wei","year":"2023"},{"key":"10.1016\/j.jss.2026.112915_b46","series-title":"Conversational question answering: A survey","author":"Zaib","year":"2021"},{"key":"10.1016\/j.jss.2026.112915_b47","doi-asserted-by":"crossref","unstructured":"Zang, X., Rastogi, A., Sunkara, S., Gupta, R., Zhang, J., Chen, J., 2020. MultiWOZ 2.2: A Dialogue Dataset with Additional Annotation Corrections and State Tracking Baselines. In: Proceedings of the 2nd Workshop on Natural Language Processing for Conversational AI. ACL 2020, pp. 109\u2013117.","DOI":"10.18653\/v1\/2020.nlp4convai-1.13"},{"key":"10.1016\/j.jss.2026.112915_b48","series-title":"BERTScore: Evaluating text generation with BERT","author":"Zhang","year":"2020"},{"key":"10.1016\/j.jss.2026.112915_b49","series-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","author":"Zheng","year":"2023"}],"container-title":["Journal of Systems and Software"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0164121226001482?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0164121226001482?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T20:23:40Z","timestamp":1780777420000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0164121226001482"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,10]]},"references-count":49,"alternative-id":["S0164121226001482"],"URL":"https:\/\/doi.org\/10.1016\/j.jss.2026.112915","relation":{},"ISSN":["0164-1212"],"issn-type":[{"value":"0164-1212","type":"print"}],"subject":[],"published":{"date-parts":[[2026,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Benchmarking contextual understanding for in-car conversational systems","name":"articletitle","label":"Article Title"},{"value":"Journal of Systems and Software","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.jss.2026.112915","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Authors. Published by Elsevier Inc.","name":"copyright","label":"Copyright"}],"article-number":"112915"}}