{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T06:39:12Z","timestamp":1777876752539,"version":"3.51.4"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032180698","type":"print"},{"value":"9783032180704","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-18070-4_6","type":"book-chapter","created":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T22:31:43Z","timestamp":1777588303000},"page":"89-98","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Benchmarking Considerations for Agentic AI Systems"],"prefix":"10.1007","author":[{"given":"Ajay","family":"Dholakia","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sachin Gopal","family":"Wani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Ellison","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miro","family":"Hodak","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Debojyoti","family":"Dutta","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shishir","family":"Nagaraja","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Raj","family":"Ranjan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,4,1]]},"reference":[{"key":"6_CR1","unstructured":"Anthropic: Building Effective Agents. https:\/\/www.anthropic.com\/research\/building-effective-agents. Accessed 16 Sept 2025"},{"key":"6_CR2","unstructured":"IBM: What is agentic AI?. https:\/\/www.ibm.com\/think\/topics\/agentic-ai. Accessed 10 Oct 2025"},{"key":"6_CR3","unstructured":"IBM Research: The next frontier in AI evaluation: a survey of agent benchmarks. https:\/\/research.ibm.com\/blog\/AI-agent-benchmarks. Accessed 16 Sept 2025"},{"key":"6_CR4","unstructured":"Wang, L., et al.: A Survey on Evaluating Large Language Model based Agents. arXiv preprint arXiv:2308.11432v7 (2025)"},{"key":"6_CR5","unstructured":"Liu, X., et al.: AgentBench: evaluating LLMs as agents. arXiv preprint arXiv:2308.03688 (2023)"},{"key":"6_CR6","unstructured":"Jimenez, C.E., et al.: SWE-bench: can language models resolve real-world github issues?. In: Proceedings of the Twelfth International Conference on Learning Representations (2024)"},{"key":"6_CR7","unstructured":"Jha, S., et al.: ITBench: evaluating AI agents across diverse real-world IT Automation Tasks. arXiv preprint arXiv:2502.05352 (2025)"},{"key":"6_CR8","unstructured":"IBM Research: ITBench: a new yardstick to measure AI agents for IT automation. https:\/\/research.ibm.com\/blog\/it-agent-benchmark. Accessed 16 Sept 2025"},{"key":"6_CR9","unstructured":"Zhou, S., et al.: WebArena: a realistic web environment for building autonomous agents. arXiv preprint arXiv:2307.13854 (2023)"},{"key":"6_CR10","unstructured":"Smbatyan, K., et al.: Can AI agents design and implement drug discovery pipelines?. arXiv preprint arXiv:2504.19912 (2025)"},{"key":"6_CR11","unstructured":"Bloor Research: The Domino Effect - benchmarking agentic AI. https:\/\/www.bloorresearch.com\/the-domino-effect-benchmarking-agentic-ai\/. Accessed 16 Sept 2025"},{"key":"6_CR12","unstructured":"Patronus AI: Introducing TRAIL: A Benchmark for Agentic Evaluation. https:\/\/www.patronus.ai\/blog\/introducing-trail-a-benchmark-for-agentic-evaluation. Accessed 16 Sept 2025"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"Gioacchini, L., et al.: AgentQuest: A modular benchmark framework to measure progress and improve LLM agents. arXiv preprint arXiv:2406.06411 (2024)","DOI":"10.18653\/v1\/2024.naacl-demo.19"},{"key":"6_CR14","unstructured":"Zhang, Z., et al.: Agent-SafetyBench: evaluating the safety of LLM agents. arXiv preprint arXiv:2412.14470 (2024)"},{"key":"6_CR15","unstructured":"Andriushchenko, M., et al.: AgentHarm: a benchmark for measuring harmfulness of LLM agents. arXiv preprint arXiv:2410.09024 (2024)"},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"Zhu, K., et al.: MultiAgentBench: evaluating the collaboration and competition of LLM agents. arXiv preprint arXiv:2503.01935 (2025)","DOI":"10.18653\/v1\/2025.acl-long.421"},{"key":"6_CR17","unstructured":"IBM: AI agent governance: Big challenges, big opportunities. https:\/\/www.ibm.com\/think\/insights\/ai-agent-governance. Accessed 09 Oct 2025"},{"key":"6_CR18","unstructured":"NIST: AI Risk Management Framework. https:\/\/www.nist.gov\/itl\/ai-risk-management-framework. Accessed 09 Oct 2025"},{"key":"6_CR19","doi-asserted-by":"publisher","unstructured":"Tabassi, E.: Artificial Intelligence Risk Management Framework (AI RMF 1.0). NIST AI 100\u20131, National Institute of Standards and Technology, Gaithersburg, MD (2023). https:\/\/doi.org\/10.6028\/NIST.AI.100-1","DOI":"10.6028\/NIST.AI.100-1"},{"key":"6_CR20","unstructured":"Nemko: ISO\/IEC JTC 1\/SC 42. https:\/\/digital.nemko.com\/standards\/iso-iec-jtc-1sc-42. Accessed 09 Oct 2025"},{"key":"6_CR21","unstructured":"UNESCO: How ISO and IEC are developing international standards for the responsible adoption of AI. https:\/\/www.unesco.org\/ethics-ai\/en\/articles\/how-iso-and-iec-are-developing-international-standards-responsible-adoption-ai. Accessed 09 Oct 2025"},{"key":"6_CR22","unstructured":"ANSI: Using AI Responsibly: U.S. Leads Efforts to Develop ISO\/IEC 42001. https:\/\/www.ansi.org\/standards-news\/all-news\/12-27-23-using-ai-responsibly-us-leads-efforts-to-develop-iso-iec-42001. Accessed 09 Oct 2025"},{"key":"6_CR23","unstructured":"CX Network: How to use agentic AI in line with the EU AI Act. https:\/\/www.cxnetwork.com\/artificial-intelligence\/articles\/how-to-use-agentic-ai-in-line-with-the-eu-ai-act. Accessed 09 Oct 2025"},{"key":"6_CR24","unstructured":"ISACA: The Growing Challenge of Auditing Agentic AI. https:\/\/www.isaca.org\/resources\/news-and-trends\/industry-news\/2025\/the-growing-challenge-of-auditing-agentic-ai. Accessed 09 Oct 2025"},{"key":"6_CR25","unstructured":"KPMG: A foundation for agentic AI. https:\/\/kpmg.com\/kpmg-us\/content\/dam\/kpmg\/pdf\/2025\/ai-governance-for-agentic-ai-era.pdf. Accessed 09 Oct 2025"},{"key":"6_CR26","unstructured":"Phelps: Agentic AI: Opportunities and Compliance Considerations for Community Banks. https:\/\/www.phelps.com\/insights\/agentic-ai-opportunities-and-compliance-considerations-for-community-banks.html. Accessed 09 Oct 2025"},{"key":"6_CR27","unstructured":"Rubrik: Take Control with Rubrik Agent Rewind. https:\/\/www.rubrik.com\/insights\/ai-issues-take-control-with-rubrik-agent-rewind. Accessed 09 Oct 2025"},{"key":"6_CR28","unstructured":"NexaStack: Ensure Trust with Traceability for Agentic AI. https:\/\/www.nexastack.ai\/blueprints\/agentic-ai-traceability\/. Accessed 09 Oct 2025"},{"key":"6_CR29","unstructured":"Anthropic: Petri: Open-source auditing for AI systems. https:\/\/www.anthropic.com\/research\/petri-open-source-auditing. Accessed 09 Oct 2025"},{"key":"6_CR30","unstructured":"Gardiner, J., Cova, M., Nagaraja, S.: Command & control: understanding, denying and detecting - a review of malware C2 techniques, detection and defences. arXiv preprint arXiv:1408.1136 (2015)"},{"key":"6_CR31","unstructured":"Bigelow, S. J.: TechTarget WhatIs? GPUs vs. TPUs vs. NPUs: comparing AI hardware options. https:\/\/www.techtarget.com\/whatis\/feature\/GPUs-vs-TPUs-vs-NPUs-Comparing-AI-hardware-options. Accessed 09 Oct 2025"},{"key":"6_CR32","unstructured":"Raveesh-Babu, S.: Advanced tracing and evaluation of generative AI agents using LangChain and Amazon SageMaker AI MLFlow. https:\/\/aws.amazon.com\/blogs\/machine-learning\/advanced-tracing-and-evaluation-of-generative-ai-agents-using-langchain-and-amazon-sagemaker-ai-mlflow\/. Accessed 09 Oct 2025"}],"container-title":["Lecture Notes in Computer Science","Performance Evaluation and Benchmarking"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-18070-4_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T22:31:46Z","timestamp":1777588306000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-18070-4_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032180698","9783032180704"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-18070-4_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"1 April 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TPCTC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Technology Conference on Performance Evaluation and Benchmarking","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"London","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tpctc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.tpc.org\/tpctc\/tpctc2025\/default5.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}