{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T16:22:55Z","timestamp":1780762975791,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,12]],"date-time":"2026-04-12T00:00:00Z","timestamp":1775952000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,12]]},"DOI":"10.1145\/3786167.3788414","type":"proceedings-article","created":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T11:40:19Z","timestamp":1779363619000},"page":"9-17","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Beyond Task Completion: An Assessment Framework for Evaluating Agentic AI Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6474-4496","authenticated-orcid":false,"given":"Sreemaee","family":"Akshathala","sequence":"first","affiliation":[{"name":"Software Engineering Research Center, IIIT Hyderabad, Hyderabad, Telangana, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0344-2111","authenticated-orcid":false,"given":"Bassam","family":"Adnan","sequence":"additional","affiliation":[{"name":"Software Engineering Research Center, IIIT Hyderabad, Hyderabad, Telangana, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0823-9048","authenticated-orcid":false,"given":"Mahisha","family":"Ramesh","sequence":"additional","affiliation":[{"name":"Software Engineering Research Center, IIIT Hyderabad, Hyderabad, Telangana, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2317-6175","authenticated-orcid":false,"given":"Karthik","family":"Vaidhyanathan","sequence":"additional","affiliation":[{"name":"Software Engineering Research Center, IIIT Hyderabad, India, Hyderabad, Telangana, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0683-5636","authenticated-orcid":false,"given":"Basil","family":"Muhammed","sequence":"additional","affiliation":[{"name":"MontyCloud, Bangalore, Karnataka, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2497-7639","authenticated-orcid":false,"given":"Kannan","family":"Parthasarathy","sequence":"additional","affiliation":[{"name":"MontyCloud, Bangalore, Karnataka, India"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,5,21]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Pierre Andrews Amine Benhalloum et\u00a0al. 2025. ARE: Scaling Up Agent Environments and Evaluations. arxiv:https:\/\/arXiv.org\/abs\/2509.17158\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2509.17158"},{"key":"e_1_3_3_2_3_2","unstructured":"Maksym Andriushchenko Alexandra Souly et\u00a0al. 2024. Agentharm: A benchmark for measuring harmfulness of llm agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.09024 (2024)."},{"key":"e_1_3_3_2_4_2","unstructured":"Mert Cemri Melissa\u00a0Z Pan Shuyi Yang et\u00a0al. 2025. Why do multi-agent llm systems fail? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.13657 (2025)."},{"key":"e_1_3_3_2_5_2","unstructured":"Yinfang Chen et\u00a0al. 2025. AIOpsLab: A Holistic Framework to Evaluate AI Agents for Enabling Autonomous Clouds. arxiv:https:\/\/arXiv.org\/abs\/2501.06706\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2501.06706"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Prateek Chhikara Dev Khant et\u00a0al. 2025. Mem0: Building Production-Ready AI Agents with Scalable Long-Term Memory. arxiv:https:\/\/arXiv.org\/abs\/2504.19413\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2504.19413","DOI":"10.3233\/FAIA251160"},{"key":"e_1_3_3_2_7_2","unstructured":"Liming Dong et\u00a0al. 2024. AgentOps: Enabling Observability of LLM Agents. arxiv:https:\/\/arXiv.org\/abs\/2411.05285\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2411.05285"},{"key":"e_1_3_3_2_8_2","unstructured":"Shiqing Fan Xichen Ding et\u00a0al. 2025. MCPToolBench++: A Large Scale AI Agent Model Context Protocol MCP Tool Use Benchmark. arxiv:https:\/\/arXiv.org\/abs\/2508.07575\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2508.07575"},{"key":"e_1_3_3_2_9_2","unstructured":"Jiawei Gu Xuhui Jiang et\u00a0al. 2025. A Survey on LLM-as-a-Judge. arxiv:https:\/\/arXiv.org\/abs\/2411.15594\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2411.15594"},{"key":"e_1_3_3_2_10_2","unstructured":"Mohammed\u00a0Mehedi Hasan Hao Li et\u00a0al. 2025. An Empirical Study of Testing Practices in Open Source AI Agent Frameworks and Agentic Applications. arxiv:https:\/\/arXiv.org\/abs\/2509.19185\u00a0[cs.SE] https:\/\/arxiv.org\/abs\/2509.19185"},{"key":"e_1_3_3_2_11_2","unstructured":"Ahmed\u00a0E. Hassan et\u00a0al. 2025. Agentic Software Engineering: Foundational Pillars and a Research Roadmap."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Sara\u00a0M Hezavehi et\u00a0al. 2021. Uncertainty in self-adaptive systems: A research community perspective. ACM Transactions on Autonomous and Adaptive Systems (TAAS) 15 4 (2021) 1\u201336.","DOI":"10.1145\/3487921"},{"key":"e_1_3_3_2_13_2","unstructured":"Yuanzhe Hu Yu Wang and Julian McAuley. 2025. Evaluating memory in llm agents via incremental multi-turn interactions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.05257 (2025)."},{"key":"e_1_3_3_2_14_2","unstructured":"Yedidel Louck Ariel Stulman and Amit Dvir. 2025. Improving Google A2A Protocol: Protecting Sensitive Data and Mitigating Unintended Harms in Multi-Agent Systems. arxiv:https:\/\/arXiv.org\/abs\/2505.12490\u00a0[cs.CR] https:\/\/arxiv.org\/abs\/2505.12490"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Qinghua Lu Liming Zhu Xiwei Xu Zhenchang Xing and Jon Whittle. 2024. Toward Responsible AI in the Era of Generative AI: A Reference Architecture for Designing Foundation Model-Based Systems. IEEE Software PP (11 2024) 1\u20137. doi:10.1109\/MS.2024.3406333","DOI":"10.1109\/MS.2024.3406333"},{"key":"e_1_3_3_2_16_2","unstructured":"Adyasha Maharana Dong-Ho Lee et\u00a0al. 2024. Evaluating very long-term conversational memory of llm agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.17753 (2024)."},{"key":"e_1_3_3_2_17_2","unstructured":"Dany Moshkovich and Sergey Zeltyn. 2025. Taming Uncertainty via Automation: Observing Analyzing and Optimizing Agentic AI Systems. arxiv:https:\/\/arXiv.org\/abs\/2507.11277\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2507.11277"},{"key":"e_1_3_3_2_18_2","unstructured":"Rudra Murthy et\u00a0al. 2025. KCIF: Knowledge-Conditioned Instruction Following. arxiv:https:\/\/arXiv.org\/abs\/2410.12972\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.12972"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ACSOS-C58168.2023.00048"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CAIN66642.2025.00031"},{"key":"e_1_3_3_2_21_2","unstructured":"Yunjia Qi Hao Peng et\u00a0al. 2025. Agentif: Benchmarking instruction following of large language models in agentic scenarios. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.16944 (2025)."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-032-02138-0_14"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511811654"},{"key":"e_1_3_3_2_24_2","unstructured":"Wangtao Sun et\u00a0al. 2024. Beyond Instruction Following: Evaluating Inferential Rule Following of Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2407.08440\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2407.08440"},{"key":"e_1_3_3_2_25_2","first-page":"1","volume-title":"2018 ieee 3rd international workshops on foundations and applications of self* systems (fas* w)","author":"Weyns Danny","year":"2018","unstructured":"Danny Weyns. 2018. Engineering self-adaptive software systems\u2013an organized tour. In 2018 ieee 3rd international workshops on foundations and applications of self* systems (fas* w). IEEE, 1\u20132."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.679"},{"key":"e_1_3_3_2_27_2","unstructured":"Shaokun Zhang et\u00a0al. 2025. Which Agent Causes Task Failures and When? On Automated Failure Attribution of LLM Multi-Agent Systems. arxiv:https:\/\/arXiv.org\/abs\/2505.00212\u00a0[cs.MA] https:\/\/arxiv.org\/abs\/2505.00212"},{"key":"e_1_3_3_2_28_2","unstructured":"Zeyu Zhang Xiaohe Bo et\u00a0al. 2024. A Survey on the Memory Mechanism of Large Language Model based Agents. arxiv:https:\/\/arXiv.org\/abs\/2404.13501\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2404.13501"},{"key":"e_1_3_3_2_29_2","unstructured":"Mingchen Zhuge et\u00a0al. 2024. Agent-as-a-Judge: Evaluate Agents with Agents. arxiv:https:\/\/arXiv.org\/abs\/2410.10934\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2410.10934"}],"event":{"name":"AGENT '26: International Workshop on Agentic Engineering","location":"Rio de Janeiro Brazil","acronym":"AGENT '26","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","IEEE CS","Faculty of Engineering of University of Porto"]},"container-title":["Proceedings of the 2026 International Workshop on Agentic Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3786167.3788414","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T12:02:15Z","timestamp":1779364935000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3786167.3788414"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":28,"alternative-id":["10.1145\/3786167.3788414","10.1145\/3786167"],"URL":"https:\/\/doi.org\/10.1145\/3786167.3788414","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-05-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}