{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T13:02:01Z","timestamp":1776085321322,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3772363.3799039","type":"proceedings-article","created":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T01:55:24Z","timestamp":1776045324000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Agent A\/B: Automated and Scalable A\/B Testing on Live Websites with Interactive LLM Agents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8520-0540","authenticated-orcid":false,"given":"Yuxuan","family":"Lu","sequence":"first","affiliation":[{"name":"Northeastern University, Boston, Massachusetts, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9082-6039","authenticated-orcid":false,"given":"Ting-Yao","family":"Hsu","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Pennsylvania State University, State College, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1426-3210","authenticated-orcid":false,"given":"Hansu","family":"Gu","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5222-3690","authenticated-orcid":false,"given":"Limeng","family":"Cui","sequence":"additional","affiliation":[{"name":"Amazon, Palo Alto, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0320-6728","authenticated-orcid":false,"given":"Yaochen","family":"Xie","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6538-1831","authenticated-orcid":false,"suffix":"III","given":"William P.","family":"Headden","sequence":"additional","affiliation":[{"name":"Amazon, Palo Alto, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8329-4610","authenticated-orcid":false,"given":"Bingsheng","family":"Yao","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, Massachusetts, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8500-0644","authenticated-orcid":false,"given":"Akash","family":"Veeragouni","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4530-9754","authenticated-orcid":false,"given":"Jiapeng","family":"Liu","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7872-2929","authenticated-orcid":false,"given":"Sreyashi","family":"Nag","sequence":"additional","affiliation":[{"name":"Amazon.com, Amazon, Seattle, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0959-8323","authenticated-orcid":false,"given":"Jessie","family":"Wang","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9371-9441","authenticated-orcid":false,"given":"Dakuo","family":"Wang","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, Massachusetts, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,13]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2514701"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/2047196.2047201"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Patrick Biernacki and Dan Waldorf. 1981. Snowball sampling: Problems and techniques of chain referral sampling. Sociological methods & research 10 2 (1981) 141\u2013163.","DOI":"10.1177\/004912418101000205"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1201\/9780203736166"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642363"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Chaoran Chen Bingsheng Yao Ruishi Zou Wenyue Hua Weimin Lyu Yanfang Ye Toby Jia-Jun Li and Dakuo Wang. 2025. Towards a Design Guideline for RPA Evaluation: A Survey of Large Language Model-Based Role-Playing Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.13012 (2025).","DOI":"10.18653\/v1\/2025.findings-acl.938"},{"key":"e_1_3_3_2_8_2","unstructured":"Jiaju Chen Yuxuan Lu Xiaojie Wang Huimin Zeng Jing Huang Jiri Gesi Ying Xu Bingsheng Yao and Dakuo Wang. 2025. Multi-Agent-as-Judge: Aligning LLM-Agent-Based Automated Evaluation with Multi-Dimensional Human Evaluation. arxiv:https:\/\/arXiv.org\/abs\/2507.21028\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2507.21028"},{"key":"e_1_3_3_2_9_2","unstructured":"Yiwei Chen Soumyadeep Pal Yimeng Zhang Qing Qu and Sijia Liu. 2025. Unlearning Isn\u2019t Invisible: Detecting Unlearning Traces in LLMs from Model Outputs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.14003 (2025)."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3437963.3441737"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2017.76"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/SEAA.2018.00021"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581273"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Chen Gao Xiaochong Lan Nian Li Yuan Yuan Jingtao Ding Zhilun Zhou Fengli Xu and Yong Li. 2024. Large language models empowered agent-based modeling and simulation: A survey and perspectives. Humanities and Social Sciences Communications 11 1 (2024) 1\u201324.","DOI":"10.1057\/s41599-024-03611-3"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Yingqiang Ge Wenyue Hua Kai Mei Juntao Tan Shuyuan Xu Zelong Li Yongfeng Zhang et\u00a0al. 2023. Openagi: When llm meets domain experts. Advances in Neural Information Processing Systems 36 (2023) 5539\u20135568.","DOI":"10.52202\/075280-0242"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3159652.3159687"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2015.7363863"},{"key":"e_1_3_3_2_18_2","unstructured":"Wayne\u00a0D Gray and Erik\u00a0M Altmann. 2001. Cognitive modeling and human-computer interaction. Karwowski [341] (2001) 387\u2013391."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","unstructured":"Somit Gupta Ronny Kohavi Diane Tang Ya Xu Reid Andersen Eytan Bakshy Niall Cardin Sumitha Chandran Nanyu Chen Dominic Coey et\u00a0al. 2019. Top Challenges from the First Practical Online Controlled Experiments Summit. SIGKDD Explorations 21 1 (2019) 20\u201335. 10.1145\/3331651.3331655","DOI":"10.1145\/3331651.3331655"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/1166253.1166300"},{"key":"e_1_3_3_2_21_2","unstructured":"Hongliang He Wenlin Yao Kaixin Ma Wenhao Yu Yong Dai Hongming Zhang Zhenzhong Lan and Dong Yu. 2024. WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.13919 (2024)."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098184"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.3386\/w31122"},{"key":"e_1_3_3_2_24_2","unstructured":"Wenyue Hua Lizhou Fan Lingyao Li Kai Mei Jianchao Ji Yingqiang Ge Libby Hemphill and Yongfeng Zhang. 2023. War and peace (waragent): Large language model-based multi-agent simulation of world wars. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17227 (2023)."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706598.3714227"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.245"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3097992"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Bonnie\u00a0E John and David\u00a0E Kieras. 1996. Using GOMS for user interface design and evaluation: which technique? ACM Transactions on Computer-Human Interaction (TOCHI) 3 4 (1996) 287\u2013319.","DOI":"10.1145\/235833.236050"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Antti Kangasr\u00e4\u00e4si\u00f6 and Samuel Kaski. 2018. Inverse reinforcement learning from summary data. Machine Learning 107 8 (2018) 1517\u20131535.","DOI":"10.1007\/s10994-018-5730-4"},{"key":"e_1_3_3_2_30_2","unstructured":"Jing\u00a0Yu Koh Robert Lo Lawrence Jang Vikram Duvvur Ming\u00a0Chong Lim Po-Yu Huang Graham Neubig Shuyan Zhou Ruslan Salakhutdinov and Daniel Fried. 2024. Visualwebarena: Evaluating multimodal agents on realistic visual web tasks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.13649 (2024)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/2487575.2488217"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/1281192.1281295"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Ron Kohavi and Roger Longbotham. 2015. Online controlled experiments and A\/B tests. Encyclopedia of machine learning and data mining (2015) 1\u201311.","DOI":"10.1007\/978-1-4899-7502-7_891-1"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Ron Kohavi Roger Longbotham Dan Sommerfield and Randal\u00a0M Henne. 2009. Controlled experiments on the web: survey and practical guide. Data mining and knowledge discovery 18 (2009) 140\u2013181.","DOI":"10.1007\/s10618-008-0114-1"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517647"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/2470654.2466420"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545693"},{"key":"e_1_3_3_2_38_2","unstructured":"Kevin Larsen Nathaniel Stevens and coauthors. 2022. Statistical Challenges in Online Controlled Experiments: A Review of A\/B Testing Methodology. arXiv preprint arXiv:2212.11366 (2022). https:\/\/arxiv.org\/abs\/2212.11366"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702565"},{"key":"e_1_3_3_2_40_2","unstructured":"Jiachen Li Justin Steinberg Xiwen Li Akshat Choube Bingsheng Yao Dakuo Wang Elizabeth Mynatt and Varun Mishra. 2024. Vital Insight: Assisting Experts\u2019 Sensemaking Process of Multi-modal Personal Tracking Data Using Visualization and LLM. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.14879 (2024)."},{"key":"e_1_3_3_2_41_2","unstructured":"Yuxuan Lu Jing Huang Yan Han Bingsheng Yao Sisong Bei Jiri Gesi Yaochen Xie Qi He Dakuo Wang et\u00a0al. 2025. Can LLM Agents Simulate Multi-Turn Human Behavior? Evidence from Real Online Customer Behavior Data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.20749 (2025)."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3519809"},{"key":"e_1_3_3_2_43_2","unstructured":"Ziyang Luo Can Xu Pu Zhao Qingfeng Sun Xiubo Geng Wenxiang Hu Chongyang Tao Jing Ma Qingwei Lin and Daxin Jiang. 2023. Wizardcoder: Empowering code large language models with evol-instruct. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.08568 (2023)."},{"key":"e_1_3_3_2_44_2","unstructured":"Michael Lutz Arth Bohra Manvel Saroyan Artem Harutyunyan and Giovanni Campagna. 2024. WILBUR: Adaptive In-Context Learning for Robust and Accurate Web Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.05902 (2024)."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3132847.3132850"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Michael\u00a0J Muller and Sandra Kogan. 2012. Grounded theory method in human-computer interaction and computer-supported cooperative work. The Human Computer Interaction Handbook (3 ed.) Julie A. Jacko (Ed.). CRC Press Boca Raton FL (2012) 1003\u20131024.","DOI":"10.1201\/b11963-51"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.5555\/212925.212982"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"crossref","unstructured":"Jonas Oppenlaender Thanassis Tiropanis and Simo Hosio. 2020. CrowdUI: Supporting web design with the crowd. Proceedings of the ACM on Human-Computer Interaction 4 EICS (2020) 1\u201328.","DOI":"10.1145\/3394978"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/2835776.2835832"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/2623330.2623334"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-09940-8_13"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3651026"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3358143"},{"key":"e_1_3_3_2_55_2","unstructured":"Ziyi Wang Yuxuan Lu Wenbo Li Amirali Amini Bo Sun Yakov Bart Weimin Lyu Jiri Gesi Tian Wang Jing Huang et\u00a0al. 2025. Opera: A dataset of observation persona rationale and action for evaluating llms on human online shopping behavior simulation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.05606 (2025)."},{"key":"e_1_3_3_2_56_2","unstructured":"Ziyi Wang Yuxuan Lu Yimeng Zhang Jing Huang Jiri Gesi Xianfeng Tang Chen Luo Yisi Sang Hanqing Lu Manling Li et\u00a0al. 2026. Trajectory2Task: Training Robust Tool-Calling Agents with Synthesized Yet Verifiable Data for Complex User Intents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2601.20144 (2026)."},{"key":"e_1_3_3_2_57_2","unstructured":"Ziyi Wang Yuxuan Lu Yimeng Zhang Jing Huang and Dakuo Wang. 2025. Customer-R1: Personalized simulation of human behaviors via RL-based LLM agent in online shopping. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.07230 (2025)."},{"key":"e_1_3_3_2_58_2","unstructured":"Qingyun Wu Gagan Bansal Jieyu Zhang Yiran Wu Shaokun Zhang Erkang Zhu Beibin Li Li Jiang Xiaoyun Zhang and Chi Wang. 2023. Autogen: Enabling next-gen llm applications via multi-agent conversation framework. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.08155 (2023)."},{"key":"e_1_3_3_2_59_2","unstructured":"Siyi Wu Feixue Han Bingsheng Yao Tianyi Xie Xuan Zhao and Dakuo Wang. 2024. Sunnie: An Anthropomorphic LLM-Based Conversational Agent for Mental Well-Being Activity Recommendation. arXiv e-prints (2024) arXiv\u20132405."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2788602"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"crossref","unstructured":"Shunyu Yao Howard Chen John Yang and Karthik Narasimhan. 2022. Webshop: Towards scalable real-world web interaction with grounded language agents. Advances in Neural Information Processing Systems 35 (2022) 20744\u201320757.","DOI":"10.52202\/068431-1508"},{"key":"e_1_3_3_2_62_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2023. React: Synergizing reasoning and acting in language models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3706599.3719729"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517607"},{"key":"e_1_3_3_2_65_2","unstructured":"Yimeng Zhang Jiri Gesi Ran Xue Tian Wang Ziyi Wang Yuxuan Lu Sinong Zhan Huimin Zeng Qingjun Cui Yufan Guo et\u00a0al. 2025. See Think Act: Online Shopper Behavior Simulation with VLM Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.19245 (2025)."},{"key":"e_1_3_3_2_66_2","unstructured":"Yihua Zhang Pingzhi Li Junyuan Hong Jiaxiang Li Yimeng Zhang Wenqing Zheng Pin-Yu Chen Jason\u00a0D Lee Wotao Yin Mingyi Hong et\u00a0al. 2024. Revisiting zeroth-order optimization for memory-efficient llm fine-tuning: A benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.11592 (2024)."},{"key":"e_1_3_3_2_67_2","unstructured":"Yimeng Zhang Tian Wang Jiri Gesi Ziyi Wang Yuxuan Lu Jiacheng Lin Sinong Zhan Vianne Gao Ruochen Jiao Junze Liu et\u00a0al. 2025. Shop-r1: Rewarding llms to simulate human behavior in online shopping via reinforcement learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.17842 (2025)."},{"key":"e_1_3_3_2_68_2","unstructured":"Shuyan Zhou Frank\u00a0F Xu Hao Zhu Xuhui Zhou Robert Lo Abishek Sridhar Xianyi Cheng Yonatan Bisk Daniel Fried Uri Alon et\u00a0al. 2023. Webarena: A realistic web environment for building autonomous agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.13854 (2023)."},{"key":"e_1_3_3_2_69_2","volume-title":"AAAI spring symposium: human behavior modeling","author":"Ziebart Brian\u00a0D","year":"2009","unstructured":"Brian\u00a0D Ziebart, Andrew\u00a0L Maas, J\u00a0Andrew Bagnell, and Anind\u00a0K Dey. 2009. Human Behavior Modeling with Maximum Entropy Inverse Optimal Control.. In AAAI spring symposium: human behavior modeling , Vol.\u00a092."}],"event":{"name":"CHI EA '26: Extended Abstracts of the 2026 CHI Conference on Human Factors in Computing Systems","location":"Barcelona , Spain","acronym":"CHI EA '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the Extended Abstracts of the 2026 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772363.3799039","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T12:21:32Z","timestamp":1776082892000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772363.3799039"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,13]]},"references-count":68,"alternative-id":["10.1145\/3772363.3799039","10.1145\/3772363"],"URL":"https:\/\/doi.org\/10.1145\/3772363.3799039","relation":{},"subject":[],"published":{"date-parts":[[2026,4,13]]},"assertion":[{"value":"2026-04-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}