{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:59:43Z","timestamp":1772906383287,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,8]],"date-time":"2024-10-08T00:00:00Z","timestamp":1728345600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,8]]},"DOI":"10.1145\/3703412.3703439","type":"proceedings-article","created":{"date-parts":[[2025,3,5]],"date-time":"2025-03-05T11:49:51Z","timestamp":1741175391000},"page":"1-5","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Methodology for Quality Assurance Testing of LLM-based Multi-Agent Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-8971-1306","authenticated-orcid":false,"given":"Isha","family":"Shamim","sequence":"first","affiliation":[{"name":"TCS, West Lafayette, IN, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3712-1784","authenticated-orcid":false,"given":"Rekha","family":"Singhal","sequence":"additional","affiliation":[{"name":"TCS, New York, NY, United States"}]}],"member":"320","published-online":{"date-parts":[[2025,3,5]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"AgentOps. 2024. AgentOps. https:\/\/www.agentops.ai\/ Accessed: 2024-07-19."},{"key":"e_1_3_3_1_3_2","unstructured":"Data\u00a0Science at Microsoft. 2023. Evaluating LLM Systems: Metrics Challenges and Best Practices. 
https:\/\/medium.com\/data-science-at-microsoft\/evaluating-llm-systems-metrics-challenges-and-best-practices-664ac25be7e5 Accessed: 2024-07-19."},{"key":"e_1_3_3_1_4_2","unstructured":"Peter Clark Isaac Cowhey Oren Etzioni Tushar Khot Ashish Sabharwal Carissa Schoenick and Oyvind Tafjord. 2018. Think you have Solved Question Answering? Try ARC the AI2 Reasoning Challenge. arxiv:https:\/\/arXiv.org\/abs\/1803.05457\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/1803.05457"},{"key":"e_1_3_3_1_5_2","unstructured":"Cohere. 2024. Cohere. https:\/\/cohere.com\/ Accessed: 2024-07-19."},{"key":"e_1_3_3_1_6_2","unstructured":"CrewAI. 2024. CrewAI. https:\/\/www.crewai.com\/ Accessed: 2024-07-19."},{"key":"e_1_3_3_1_7_2","unstructured":"CrewAI Inc.2024. CrewAI Examples. https:\/\/github.com\/crewAIInc\/crewAI-examples Accessed: 2024-07-19."},{"key":"e_1_3_3_1_8_2","unstructured":"Google Cloud. 2024. Vertex AI. https:\/\/cloud.google.com\/vertex-ai?hl=en Accessed: 2024-07-19."},{"key":"e_1_3_3_1_9_2","unstructured":"Dan Hendrycks Collin Burns Steven Basart Andy Zou Mantas Mazeika Dawn Song and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. arxiv:https:\/\/arXiv.org\/abs\/2009.03300\u00a0[cs.CY] https:\/\/arxiv.org\/abs\/2009.03300"},{"key":"e_1_3_3_1_10_2","unstructured":"LangChain. 2024. Langsmith. https:\/\/www.langchain.com\/langsmith Accessed: 2024-07-19."},{"key":"e_1_3_3_1_11_2","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga Yian Zhang Deepak Narayanan Yuhuai Wu Ananya Kumar Benjamin Newman Binhang Yuan Bobby Yan Ce Zhang Christian Cosgrove Christopher\u00a0D. Manning Christopher R\u00e9 Diana Acosta-Navas Drew\u00a0A. 
Hudson Eric Zelikman Esin Durmus Faisal Ladhak Frieda Rong Hongyu Ren Huaxiu Yao Jue Wang Keshav Santhanam Laurel Orr Lucia Zheng Mert Yuksekgonul Mirac Suzgun Nathan Kim Neel Guha Niladri Chatterji Omar Khattab Peter Henderson Qian Huang Ryan Chi Sang\u00a0Michael Xie Shibani Santurkar Surya Ganguli Tatsunori Hashimoto Thomas Icard Tianyi Zhang Vishrav Chaudhary William Wang Xuechen Li Yifan Mai Yuhui Zhang and Yuta Koreeda. 2023. Holistic Evaluation of Language Models. arxiv:https:\/\/arXiv.org\/abs\/2211.09110\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2211.09110"},{"key":"e_1_3_3_1_12_2","unstructured":"Stephanie Lin Jacob Hilton and Owain Evans. 2022. TruthfulQA: Measuring How Models Mimic Human Falsehoods. arxiv:https:\/\/arXiv.org\/abs\/2109.07958\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2109.07958"},{"key":"e_1_3_3_1_13_2","unstructured":"Microsoft. 2024. Azure AI. https:\/\/ai.azure.com\/ Accessed: 2024-07-19."},{"key":"e_1_3_3_1_14_2","unstructured":"Microsoft. 2024. PromptFlow. https:\/\/github.com\/microsoft\/promptflow Accessed: 2024-07-19."},{"key":"e_1_3_3_1_15_2","unstructured":"Aarohi Srivastava Abhinav Rastogi Abhishek Rao Abu Awal\u00a0Md Shoeb Abubakar Abid Adam Fisch Adam\u00a0R. Brown Adam Santoro Aditya Gupta and Adri\u00e0 Garriga-Alonso. 2023. Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models. arxiv:https:\/\/arXiv.org\/abs\/2206.04615\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2206.04615"},{"key":"e_1_3_3_1_16_2","unstructured":"Trulens. n. d.. Core Concepts: Feedback Functions. https:\/\/www.trulens.org\/trulens_eval\/getting_started\/core_concepts\/feedback_functions\/ Accessed: July 17 2024."},{"key":"e_1_3_3_1_17_2","unstructured":"Alex Wang Yada Pruksachatkun Nikita Nangia Amanpreet Singh Julian Michael Felix Hill Omer Levy and Samuel\u00a0R. Bowman. 2020. SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems. 
arxiv:https:\/\/arXiv.org\/abs\/1905.00537\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1905.00537"},{"key":"e_1_3_3_1_18_2","unstructured":"Alex Wang Amanpreet Singh Julian Michael Felix Hill Omer Levy and Samuel\u00a0R. Bowman. 2019. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. arxiv:https:\/\/arXiv.org\/abs\/1804.07461\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1804.07461"},{"key":"e_1_3_3_1_19_2","unstructured":"Jize Wang Zerun Ma Yining Li Songyang Zhang Cailian Chen Kai Chen and Xinyi Le. 2024. GTA: A Benchmark for General Tool Agents. arxiv:https:\/\/arXiv.org\/abs\/2407.08713\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2407.08713"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Ari Holtzman Yonatan Bisk Ali Farhadi and Yejin Choi. 2019. HellaSwag: Can a Machine Really Finish Your Sentence? arxiv:https:\/\/arXiv.org\/abs\/1905.07830\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1905.07830","DOI":"10.18653\/v1\/P19-1472"}],"event":{"name":"AIMLSystems 2024: The 4th International Conference on AI-ML Systems","location":"Baton Rouge Louisiana USA","acronym":"AIMLSystems 2024"},"container-title":["Proceedings of the 4th International Conference on AI-ML 
Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3703412.3703439","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3703412.3703439","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3703412.3703439"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,8]]},"references-count":19,"alternative-id":["10.1145\/3703412.3703439","10.1145\/3703412"],"URL":"https:\/\/doi.org\/10.1145\/3703412.3703439","relation":{},"subject":[],"published":{"date-parts":[[2024,10,8]]},"assertion":[{"value":"2025-03-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}