{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:18:13Z","timestamp":1763191093009,"version":"3.45.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228172","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-9","source":"Crossref","is-referenced-by-count":0,"title":["PromptMetric: Prompt Recipe as an Automatic Metric for Evaluating Open-domain Question Answering Systems"],"prefix":"10.1109","author":[{"given":"Pengzhe","family":"Wang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Zeng","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chiwei","family":"Zhu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Benfeng","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhendong","family":"Mao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongdong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"volume-title":"Retrieving and reading: A comprehensive survey on open-domain question answering","author":"Zhu","key":"ref1"},{"volume-title":"SearchQA: A new q&a dataset augmented with context from a search engine","author":"Dunn","key":"ref2"},{"key":"ref3","first-page":"1601","article-title":"TriviaQA: A large scale distantly supervised challenge dataset for reading comprehension","volume-title":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Joshi"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"ref5","first-page":"3214","article-title":"TruthfulQA: Measuring how models mimic human falsehoods","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Lin"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00370"},{"key":"ref7","first-page":"5591","article-title":"Evaluating open-domain question answering in the era of large language models","volume-title":"Association for Computational Linguistics","author":"Kamalloo"},{"volume-title":"Evaluating open-QA evaluation","author":"Wang","key":"ref8"},{"key":"ref9","first-page":"1870","article-title":"Reading Wikipedia to answer open-domain questions","volume-title":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Chen"},{"key":"ref10","first-page":"845","article-title":"Simple and effective multi-paragraph reading comprehension","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Clark"},{"key":"ref11","first-page":"6086","article-title":"Latent retrieval for weakly supervised open domain question answering","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","author":"Lee"},{"key":"ref12","first-page":"874","article-title":"Leveraging passage retrieval with generative models for open domain question answering","volume-title":"Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume","author":"Izacard"},{"key":"ref13","first-page":"2226","article-title":"Evidentiality-guided generation for knowledge-intensive NLP tasks","volume-title":"Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Asai"},{"key":"ref14","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/D16-1264","article-title":"Squad: 100,000+ questions for machine comprehension of text","author":"Rajpurkar","year":"2016"},{"article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","year":"2021","author":"Lewis","key":"ref15"},{"key":"ref16","first-page":"3493","article-title":"Better zero-shot reasoning with self-adaptive prompting","volume-title":"Findings of the Association for Computational Linguistics: ACL 2023","author":"Wan"},{"key":"ref17","first-page":"119","article-title":"Evaluating question answering evaluation","volume-title":"Proceedings of the 2nd Workshop on Machine Reading for Question Answering","author":"Chen"},{"key":"ref18","first-page":"149","article-title":"Semantic answer similarity for evaluating question answering models","volume-title":"Proceedings of the 3rd Workshop on Machine Reading for Question Answering","author":"Risch"},{"key":"ref19","first-page":"291","article-title":"Tomayto, tomahto. beyond token-level answer equivalence for question answering evaluation","volume-title":"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing","author":"Bulian"},{"key":"ref20","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.acl-long.754","article-title":"Self-instruct: Aligning language models with self-generated instructions","author":"Wang","year":"2023"},{"article-title":"Wizardlm: Empowering large language models to follow complex instructions","year":"2023","author":"Xu","key":"ref21"},{"article-title":"#instag: Instruction tagging for analyzing supervised fine-tuning of large language models","year":"2023","author":"Lu","key":"ref22"},{"key":"ref23","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.acl-long.510","article-title":"Math-shepherd: Verify and reinforce llms step-by-step without human annotations","author":"Wang","year":"2024"},{"article-title":"Tree of thoughts: Deliberate problem solving with large language models","year":"2023","author":"Yao","key":"ref24"},{"article-title":"Let\u2019s verify step by step","year":"2023","author":"Lightman","key":"ref25"},{"key":"ref26","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2023.findings-emnlp.1022","article-title":"Grace: Discriminator-guided chain-of-thought reasoning","author":"Khalifa","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29720"},{"article-title":"Self-reflection in llm agents: Effects on problem-solving performance","year":"2024","author":"Renze","key":"ref28"},{"article-title":"Length-controlled alpacaeval: A simple way to debias automatic evaluators","year":"2024","author":"Dubois","key":"ref29"},{"article-title":"Alpacafarm: A simulation framework for methods that learn from human feedback","year":"2024","author":"Dubois","key":"ref30"},{"author":"Zheng","key":"ref31","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena"},{"article-title":"From live data to high-quality benchmarks: The arena-hard pipeline","year":"2024","author":"Li","key":"ref32"},{"article-title":"Mixeval: Deriving wisdom of the crowd from llm benchmark mixtures","year":"2024","author":"Ni","key":"ref33"},{"volume-title":"CFMatch: Aligning automated answer equivalence evaluation with expert judgments for open-domain question answering","author":"Li","key":"ref34"},{"article-title":"Gpt-4 technical report","year":"2024","author":"Achiam","key":"ref35"},{"article-title":"Qwen2 technical report","year":"2024","author":"Yang","key":"ref36"},{"article-title":"The llama 3 herd of models","year":"2024","author":"Grattafiori","key":"ref37"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228172.pdf?arnumber=11228172","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:14:42Z","timestamp":1763190882000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228172\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228172","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}