{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T15:29:18Z","timestamp":1776785358550,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3713581","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T03:33:32Z","timestamp":1745465612000},"page":"1-15","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":26,"title":["Interactive Debugging and Steering of Multi-Agent AI Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2745-4315","authenticated-orcid":false,"given":"Will","family":"Epperson","sequence":"first","affiliation":[{"name":"Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7741-3861","authenticated-orcid":false,"given":"Gagan","family":"Bansal","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, Washington, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1839-5632","authenticated-orcid":false,"given":"Victor C","family":"Dibia","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, Washington, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4986-7794","authenticated-orcid":false,"given":"Adam","family":"Fourney","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, Washington, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1966-2434","authenticated-orcid":false,"given":"Jack","family":"Gerrits","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, Washington, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3326-1790","authenticated-orcid":false,"given":"Erkang (Eric)","family":"Zhu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, Washington, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3294-7288","authenticated-orcid":false,"given":"Saleema","family":"Amershi","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, Washington, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-SEIP.2019.00042"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702509"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Ian Arawjo Chelse Swoopes Priyan Vaithilingam Martin Wattenberg and Elena Glassman. 2023. ChainForge: A Visual Toolkit for Prompt Engineering and LLM Hypothesis Testing. arxiv:https:\/\/arXiv.org\/abs\/2309.09128\u00a0[cs.HC]","DOI":"10.1145\/3613904.3642016"},{"key":"e_1_3_3_2_5_2","unstructured":"Autoblocks AI. 2024. Autoblocks. https:\/\/www.autoblocks.ai. Accessed: 2024-12-02."},{"key":"e_1_3_3_2_6_2","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems , H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.), Vol.\u00a033. Curran Associates, Inc., Red Hook, NY, USA, 1877\u20131901. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581268"},{"key":"e_1_3_3_2_8_2","unstructured":"Furui Cheng Vil\u00e9m Zouhar Robin Shing\u00a0Moon Chan Daniel F\u00fcrst Hendrik Strobelt and Mennatallah El-Assady. 2024. Interactive Analysis of LLMs using Meaningful Counterfactuals. arxiv:https:\/\/arXiv.org\/abs\/2405.00708\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2405.00708"},{"key":"e_1_3_3_2_9_2","unstructured":"Yuheng Cheng Ceyao Zhang Zhengwen Zhang Xiangrui Meng Sirui Hong Wenhao Li Zihao Wang Zekai Wang Feng Yin Junhua Zhao and Xiuqiang He. 2024. Exploring Large Language Model based Intelligent Agents: Definitions Methods and Prospects. arxiv:https:\/\/arXiv.org\/abs\/2401.03428\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2401.03428"},{"key":"e_1_3_3_2_10_2","unstructured":"CrewAI. 2024. CrewAI. https:\/\/www.crewai.com\/. Accessed: 2024-12-02."},{"key":"e_1_3_3_2_11_2","unstructured":"Victor Dibia Jingya Chen Gagan Bansal Suff Syed Adam Fourney Erkang Zhu Chi Wang and Saleema Amershi. 2024. AutoGen Studio: A No-Code Developer Tool for Building and Debugging Multi-Agent Systems. arxiv:https:\/\/arXiv.org\/abs\/2408.15247\u00a0[cs.SE] https:\/\/arxiv.org\/abs\/2408.15247"},{"key":"e_1_3_3_2_12_2","unstructured":"Hugging Face. 2024. GAIA Benchmark Leaderboard. https:\/\/huggingface.co\/spaces\/gaia-benchmark\/leaderboard Accessed 08-2024."},{"key":"e_1_3_3_2_13_2","volume-title":"Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks","author":"Fourney Adam","year":"2024","unstructured":"Adam Fourney, Gagan Bansal, Hussein Mozannar, Cheng Tan, Eduardo Salinas, Erkang\u00a0(Eric) Zhu, Friederike Niedtner, Grace Proebsting, Griffin Bassman, Jack Gerrits, Jacob Alber, Peter Chang, Ricky Loynd, Robert West, Victor Dibia, Ahmed Awadallah, Ece Kamar, Rafah Hosn, and Saleema Amershi. 2024. Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks. Technical Report MSR-TR-2024-47. Microsoft. https:\/\/www.microsoft.com\/en-us\/research\/publication\/magentic-one-a-generalist-multi-agent-system-for-solving-complex-tasks\/"},{"key":"e_1_3_3_2_14_2","unstructured":"GitKraken. 2024. GitKraken Commit Graph: Bring color & clarity to your commit history. https:\/\/www.gitkraken.com\/solutions\/commit-graph. Accessed: 2024-09."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/890"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.371"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3503564"},{"key":"e_1_3_3_2_18_2","unstructured":"Carlos\u00a0E Jimenez John Yang Alexander Wettig Shunyu Yao Kexin Pei Ofir Press and Karthik\u00a0R Narasimhan. 2024. SWE-bench: Can Language Models Resolve Real-world Github Issues?https:\/\/openreview.net\/forum?id=VTF8yNQM66"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","unstructured":"Minsuk Kahng Ian Tenney Mahima Pushkarna Michael\u00a0Xieyang Liu James Wexler Emily Reif Krystal Kallarackal Minsuk Chang Michael Terry and Lucas Dixon. 2025. LLM Comparator: Interactive Analysis of Side-by-Side Evaluation of Large Language Models. IEEE Transactions on Visualization and Computer Graphics 31 1 (2025) 503\u2013513. 10.1109\/TVCG.2024.3456354","DOI":"10.1109\/TVCG.2024.3456354"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642216"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/2678025.2701399"},{"key":"e_1_3_3_2_22_2","unstructured":"LangChain. 2024. LangGraph Studio. https:\/\/github.com\/langchain-ai\/langgraph-studio. Accessed: 2024-12-02."},{"key":"e_1_3_3_2_23_2","unstructured":"Guohao Li Hasan Abed Al\u00a0Kader Hammoud Hani Itani Dmitrii Khizbullin and Bernard Ghanem. 2023. CAMEL: Communicative Agents for \"Mind\" Exploration of Large Language Model Society. arxiv:https:\/\/arXiv.org\/abs\/2303.17760\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2303.17760"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/1518701.1519023"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866040"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","unstructured":"Nelson\u00a0F. Liu Kevin Lin John Hewitt Ashwin Paranjape Michele Bevilacqua Fabio Petroni and Percy Liang. 2024. Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12 (2024) 157\u2013173. 10.1162\/tacl_a_00638","DOI":"10.1162\/tacl_a_00638"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.173"},{"key":"e_1_3_3_2_28_2","unstructured":"Gregoire Mialon Thomas Scialom Cl\u00e9mentine Fourrier Thomas Wolf and Yann LeCun. 2024. GAIA: A Benchmark for General AI Assistants. https:\/\/ai.meta.com\/research\/publications\/gaia-a-benchmark-for-general-ai-assistants\/"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10633"},{"key":"e_1_3_3_2_30_2","unstructured":"OpenAI. 2024. Chat Playground. https:\/\/platform.openai.com\/playground\/. Accessed: 2024-09-05."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","unstructured":"Deokgun Park Steven\u00a0M. Drucker Roland Fernandez and Niklas Elmqvist. 2018. Atom: A Grammar for Unit Visualizations. IEEE Transactions on Visualization and Computer Graphics 24 12 (2018) 3032\u20133043. 10.1109\/TVCG.2017.2785807","DOI":"10.1109\/TVCG.2017.2785807"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866038"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645144"},{"key":"e_1_3_3_2_34_2","unstructured":"Promptfoo. 2024. Promptfoo. https:\/\/www.promptfoo.dev\/. Accessed: 2024-12-02."},{"key":"e_1_3_3_2_35_2","unstructured":"Python Software Foundation. 2024. The Python Debugger (pdb). https:\/\/docs.python.org\/3\/library\/pdb.html. Accessed: 2024-09-05."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.230"},{"key":"e_1_3_3_2_37_2","unstructured":"D. Sculley Gary Holt Daniel Golovin Eugene Davydov Todd Phillips Dietmar Ebner Vinay Chaudhary and Michael Young. 2014. Machine Learning: The High Interest Credit Card of Technical Debt."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Hendrik Strobelt Albert Webson Victor Sanh Benjamin Hoover Johanna Beyer Hanspeter Pfister and Alexander\u00a0M. Rush. 2022. Interactive and Visual Prompt Engineering for Ad-hoc Task Adaptation with Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2208.07852\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2208.07852","DOI":"10.1109\/TVCG.2022.3209479"},{"key":"e_1_3_3_2_40_2","unstructured":"Ian Tenney Ryan Mullins Bin Du Shree Pandya Minsuk Kahng and Lucas Dixon. 2024. Interactive Prompt Debugging with Sequence Salience. arxiv:https:\/\/arXiv.org\/abs\/2404.07498\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2404.07498"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Sandra Wachter Brent\u00a0D. Mittelstadt and Chris Russell. 2017. Counterfactual Explanations without Opening the Black Box: Automated Decisions and the GDPR. arXiv:https:\/\/arXiv.org\/abs\/1711.00399http:\/\/arxiv.org\/abs\/1711.00399","DOI":"10.2139\/ssrn.3063289"},{"key":"e_1_3_3_2_42_2","unstructured":"Xingyao Wang Boxuan Li Yufan Song Frank\u00a0F. Xu Xiangru Tang Mingchen Zhuge Jiayi Pan Yueqi Song Bowen Li Jaskirat Singh Hoang\u00a0H. Tran Fuqiang Li Ren Ma Mingzhang Zheng Bill Qian Yanjun Shao Niklas Muennighoff Yizhe Zhang Binyuan Hui Junyang Lin Robert Brennan Hao Peng Heng Ji and Graham Neubig. 2024. OpenDevin: An Open Platform for AI Software Developers as Generalist Agents. arxiv:https:\/\/arXiv.org\/abs\/2407.16741\u00a0[cs.SE] https:\/\/arxiv.org\/abs\/2407.16741"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","unstructured":"James Wexler Mahima Pushkarna Tolga Bolukbasi Martin Wattenberg Fernanda\u00a0B. Vi\u00e9gas and Jimbo Wilson. 2020. The What-If Tool: Interactive Probing of Machine Learning Models. IEEE Trans. Vis. Comput. Graph. 26 1 (2020) 56\u201365. 10.1109\/TVCG.2019.2934619","DOI":"10.1109\/TVCG.2019.2934619"},{"key":"e_1_3_3_2_44_2","unstructured":"Qingyun Wu Gagan Bansal Jieyu Zhang Yiran Wu Beibin Li Erkang\u00a0(Eric) Zhu Li Jiang Xiaoyun Zhang Shaokun Zhang Ahmed Awadallah Ryen\u00a0W. White Doug Burger and Chi Wang. 2024. AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation. https:\/\/www.microsoft.com\/en-us\/research\/publication\/autogen-enabling-next-gen-llm-applications-via-multi-agent-conversation-framework\/"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3519729"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.523"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517582"},{"key":"e_1_3_3_2_48_2","unstructured":"Zhiyong Wu Chengcheng Han Zichen Ding Zhenmin Weng Zhoumianze Liu Shunyu Yao Tao Yu and Lingpeng Kong. 2024. OS-Copilot: Towards Generalist Computer Agents with Self-Improvement. arxiv:https:\/\/arXiv.org\/abs\/2402.07456\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2402.07456"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581388"},{"key":"e_1_3_3_2_50_2","unstructured":"Shuyan Zhou Frank\u00a0F. Xu Hao Zhu Xuhui Zhou Robert Lo Abishek Sridhar Xianyi Cheng Tianyue Ou Yonatan Bisk Daniel Fried Uri Alon and Graham Neubig. 2024. WebArena: A Realistic Web Environment for Building Autonomous Agents. https:\/\/openreview.net\/forum?id=oKn9c6ytLx"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713581","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3713581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:28:47Z","timestamp":1751606927000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713581"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":49,"alternative-id":["10.1145\/3706598.3713581","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3713581","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}