{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,29]],"date-time":"2026-03-29T01:10:53Z","timestamp":1774746653723,"version":"3.50.1"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031680304","type":"print"},{"value":"9783031680311","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-68031-1_3","type":"book-chapter","created":{"date-parts":[[2024,9,21]],"date-time":"2024-09-21T06:02:03Z","timestamp":1726898523000},"page":"34-43","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Benchmarking Generative AI Performance Requires a Holistic Approach"],"prefix":"10.1007","author":[{"given":"Ajay","family":"Dholakia","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Ellison","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miro","family":"Hodak","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Debojyoti","family":"Dutta","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Carsten","family":"Binnig","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,22]]},"reference":[{"key":"3_CR1","unstructured":"MLPerf. https:\/\/mlcommons.org\/."},{"key":"3_CR2","unstructured":"TPCx-AI. https:\/\/www.tpc.org\/tpcx-ai\/default5.asp."},{"key":"3_CR3","unstructured":"Transaction Processing and Performance Council, \u201cTPC Express Benchmark \u2122 AI \u2013 Full Disclosure Report. (2022)"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Dholakia, A., Ellison, D., Hodak, M., Dutta, D.: Going beyond speeds and feeds: benchmarking considerations for trustworthy and responsible AI. In: 14th TPC Technology Conference (2022)","DOI":"10.1007\/978-3-031-29576-8_8"},{"key":"3_CR5","unstructured":"Bommasani, R., et.al.: On the opportunities and risks of foundation models (2022). arXiv preprint https:\/\/arxiv.org\/pdf\/2108.07258.pdf."},{"key":"3_CR6","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (2019). arXiv preprint arXiv:1810:04805v2"},{"key":"3_CR7","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language Models are Unsupervised Multitask Learners (2019)"},{"key":"3_CR8","unstructured":"Touvron, H., et al.: LlaMA: Open and efficient foundation language models (2023). arXiv: 2302.13971v1."},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Hodak, M., Van Buren, C., Jiang, X., Ellison, D., Dholakia, A.: Benchmarking large language models: opportunities and challenges. In: 15th TPC Technology Conference (2023)","DOI":"10.1007\/978-3-031-68031-1_6"},{"key":"3_CR10","unstructured":"Liang, P., et al.: Holistic Evaluation of Language Models (2022). arXiv:2211.09110. Accessed 28 Aug 2023"},{"key":"3_CR11","unstructured":"Ye, S., et al.: FLASK: Fine-grained Language Model Evaluation based on Alignment Skill Sets (2023).arXiv:2307.10928. Accessed 28 Aug 2023"},{"key":"3_CR12","doi-asserted-by":"publisher","unstructured":"Liu Olesiuk, Y., Hodak, M., Ellison, D., Dholakia, A.: More the merrier: comparative evaluation of TPCx-AI and MLPerf benchmarks for AI. In: Nambiar, R., Poess, M. (eds.) Performance Evaluation and Benchmarking, TPCTC 2022. Lecture Notes in Computer Science, vol 13860. Springer, Cham. https:\/\/doi.org\/10.1007\/978-3-031-29576-8_5","DOI":"10.1007\/978-3-031-29576-8_5"},{"key":"3_CR13","unstructured":"DataPerf Working Group. https:\/\/mlcommons.org\/en\/groups\/research-dataperf\/"},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"Kiela, D., et al.: Dynabench: Rethinking Benchmarking in NLP (2021) arXiv:2104.14337. Accessed 28 Aug 2023","DOI":"10.18653\/v1\/2021.naacl-main.324"},{"key":"3_CR15","unstructured":"Beeching, E., et al.: Open LLM Leaderboard, Hugging Face (2023) https:\/\/huggingface.co\/spaces\/HuggingFaceH4\/open_llm_leaderboard."},{"key":"3_CR16","unstructured":"Zhao, W.X., et al.: A Survey of Large Language Models (2023) https:\/\/arxiv.org\/abs\/2303.18223."},{"key":"3_CR17","unstructured":"MLPerf Training. https:\/\/mlcommons.org\/en\/training-normal-07\/"},{"key":"3_CR18","unstructured":"MLPerf Inference. https:\/\/mlcommons.org\/en\/inference-datacenter-31\/"},{"key":"3_CR19","unstructured":"Lu, Q., et al.: Towards Responsible AI in the Era of ChatGPT: A Reference Architecture for Designing Foundation Model-based AI Systems (2023). https:\/\/arxiv.org\/abs\/2304.11090."},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Harrer, S.: Attention is not all you need: the complicated case of ethically using large language models in healthcare and medicine (2023) https:\/\/www.ncbi.nlm.nih.gov\/pmc\/articles\/PMC10025985\/.","DOI":"10.1016\/j.ebiom.2023.104512"},{"key":"3_CR21","unstructured":"Google. https:\/\/ai.google.com\/research\/NaturalQuestions"},{"key":"3_CR22","unstructured":"Karargyris, A., et al.: MedPerf: Open Benchmarking Platform for Medical Artificial Intelligence using Federated Evaluation (2021) arXiv:2110.01406, Accessed 28 Aug 2023"},{"key":"3_CR23","unstructured":"Saeed, M., De Cao, N., Papotti, P.: Querying Large Language Models with SQL (2023). CoRR abs\/2304.00472"},{"key":"3_CR24","unstructured":"Urban, M., Binnig, C.: Hybrid database operations: learned operations for seamless querying of textual and tabular data. In: LWDA, 8\u201311 (2022)"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Urban, M., Nguyen, D.D., Binnig, C.: OmniscientDB: a large language model-augmented DBMS that knows what other DBMSs do not know. In: aiDM '23: Proceedings of the Sixth International Workshop on Exploiting Artificial Intelligence Techniques for Data Management, pp. 1\u20137 (2023)","DOI":"10.1145\/3593078.3593933"}],"container-title":["Lecture Notes in Computer Science","Performance Evaluation and Benchmarking"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-68031-1_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:45:52Z","timestamp":1732805152000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-68031-1_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031680304","9783031680311"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-68031-1_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"22 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TPCTC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Technology Conference on Performance Evaluation and Benchmarking","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vancouver, BC","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Canada","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tpctc2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}