{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,26]],"date-time":"2026-06-26T12:03:08Z","timestamp":1782475388531,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T00:00:00Z","timestamp":1779753600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,26]]},"DOI":"10.1145\/3786335.3813162","type":"proceedings-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T03:16:22Z","timestamp":1779419782000},"page":"497-513","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["ViBench: A Benchmark on Vibe Coding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3614-3139","authenticated-orcid":false,"given":"Peter","family":"Zhong","sequence":"first","affiliation":[{"name":"Replit, Foster City, USA and Carnegie Mellon University, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1340-697X","authenticated-orcid":false,"given":"Pashootan","family":"Vaezipoor","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3383-7184","authenticated-orcid":false,"given":"Fuyang","family":"Cui","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6517-3933","authenticated-orcid":false,"given":"Vaibhav","family":"Kumar","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5596-1639","authenticated-orcid":false,"given":"James","family":"Austin","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0463-6975","authenticated-orcid":false,"given":"Azin","family":"Asgarian","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9978-2924","authenticated-orcid":false,"given":"Toby","family":"Ho","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7961-5412","authenticated-orcid":false,"given":"Paul","family":"Inder","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0151-9274","authenticated-orcid":false,"given":"Imen","family":"Kedir","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4399-9925","authenticated-orcid":false,"given":"Zhen","family":"Li","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9749-7348","authenticated-orcid":false,"given":"Nicholas","family":"Ondo","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4207-8252","authenticated-orcid":false,"given":"Asna","family":"Shafiq","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3259-990X","authenticated-orcid":false,"given":"Ibrahim","family":"Sheikh","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5304-536X","authenticated-orcid":false,"given":"Edouard","family":"Sioufi","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0995-6654","authenticated-orcid":false,"given":"Setareh","family":"Soltanieh","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4055-414X","authenticated-orcid":false,"given":"Ben","family":"Wilde","sequence":"additional","affiliation":[{"name":"Georgian AI Lab, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3554-5885","authenticated-orcid":false,"given":"Jacky","family":"Zhao","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3140-3133","authenticated-orcid":false,"given":"Ryan","family":"Carelli","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2059-5406","authenticated-orcid":false,"given":"Heather","family":"Miller","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0542-0061","authenticated-orcid":false,"given":"Michele","family":"Catasta","sequence":"additional","affiliation":[{"name":"Replit, Foster City, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,5,26]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Anthropic. 2024. Introducing computer use a new Claude 3.5 Sonnet and Claude 3.5 Haiku. https:\/\/www.anthropic.com\/news\/3-5-models-and-computer-use."},{"key":"e_1_3_3_3_3_2","unstructured":"Anthropic. 2025. Text Editor Tool. https:\/\/platform.claude.com\/docs\/en\/agents-and-tools\/tool-use\/text-editor-tool."},{"key":"e_1_3_3_3_4_2","unstructured":"AppBench Team. 2025. AppBench: Evaluating AI Agents on Application Development. https:\/\/appbench.ai\/."},{"key":"e_1_3_3_3_5_2","unstructured":"Jacob Austin Augustus Odena Maxwell Nye Maarten Bosma Henryk Michalewski David Dohan Ellen Jiang Carrie Cai Michael Terry Quoc Le and Charles Sutton. 2021. Program Synthesis with Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.07732 (2021). https:\/\/arxiv.org\/abs\/2108.07732"},{"key":"e_1_3_3_3_6_2","unstructured":"Anonymous Authors. 2025. CodeClash: Benchmarking Goal-Oriented Software Engineering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.00839 (2025)."},{"key":"e_1_3_3_3_7_2","unstructured":"Yutong Bian Xianhao Lin Yupeng Xie Tianyang Liu Mingchen Zhuge Siyuan Lu Haoming Tang Jinlin Wang Jiayi Zhang Jiaqi Chen Xiangru Tang Yongxin Ni Sirui Hong and Chenglin Wu. 2025. You Don\u2019t Know Until You Click: Automated GUI Testing for Production-Ready Software Evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.14104 (2025). https:\/\/arxiv.org\/abs\/2508.14104"},{"key":"e_1_3_3_3_8_2","unstructured":"Browser-Use. 2025. Make websites accessible for AI agents. https:\/\/github.com\/browser-use\/browser-use."},{"key":"e_1_3_3_3_9_2","unstructured":"Mert Cemri Melissa\u00a0Z. Pan Shuyi Yang Lakshya\u00a0A. Agrawal Bhavya Chopra Rishabh Tiwari Kurt Keutzer Aditya Parameswaran Dan Klein Kannan Ramchandran Matei Zaharia Joseph\u00a0E. Gonzalez and Ion Stoica. 2025. Why Do Multi-Agent LLM Systems Fail? arxiv:https:\/\/arXiv.org\/abs\/2503.13657\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2503.13657"},{"key":"e_1_3_3_3_10_2","unstructured":"Haorui Chen Chengze Li and Jia Li. 2025. FeatBench: Evaluating Coding Agents on Feature Implementation for Vibe Coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.22237 (2025). https:\/\/arxiv.org\/abs\/2509.22237"},{"key":"e_1_3_3_3_11_2","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique\u00a0Ponde de Oliveira\u00a0Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe\u00a0Petroski Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William\u00a0Hebgen Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew\u00a0N. Carr Jan Leike Josh Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating Large Language Models Trained on Code. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.03374 (2021). https:\/\/github.com\/openai\/human-eval"},{"key":"e_1_3_3_3_12_2","unstructured":"Chrome DevTools. 2024. Chrome DevTools MCP. https:\/\/github.com\/ChromeDevTools\/chrome-devtools-mcp."},{"key":"e_1_3_3_3_13_2","unstructured":"Xiang Deng Jeff Da Edwin Pan Yannis\u00a0Yiming He Charles Ide Kanak Garg Niklas Lauffer Andrew Park Nitin Pasari Chetan Rane Karmini Sampath Maya Krishnan Srivatsa Kundurthy Sean Hendryx Zifan Wang Vijay Bharadwaj Jeff Holm Raja Aluri Chen Bo\u00a0Calvin Zhang Noah Jacobson Bing Liu and Brad Kenstler. 2025. SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.16941 (2025). https:\/\/arxiv.org\/abs\/2509.16941"},{"key":"e_1_3_3_3_14_2","unstructured":"Xueying Du Mingwei Liu Kaixin Wang Hanlin Wang Junwei Liu Yixuan Chen Jiayi Feng Chaofeng Sha Xin Peng and Yiling Lou. 2023. ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.01861 (2023). https:\/\/arxiv.org\/abs\/2308.01861"},{"key":"e_1_3_3_3_15_2","unstructured":"Google. 2017. Google Colaboratory. https:\/\/colab.research.google.com\/."},{"key":"e_1_3_3_3_16_2","unstructured":"Fabian Hertwig. 2025. Code Surgery: How AI Assistants Make Precise Edits to Your Files. https:\/\/fabianhertwig.com\/blog\/coding-assistants-file-edits\/."},{"key":"e_1_3_3_3_17_2","volume-title":"Proceedings of the Twelfth International Conference on Learning Representations (ICLR)","author":"Jimenez Carlos\u00a0E.","year":"2024","unstructured":"Carlos\u00a0E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. 2024. SWE-bench: Can Language Models Resolve Real-World GitHub Issues?. In Proceedings of the Twelfth International Conference on Learning Representations (ICLR). https:\/\/www.swebench.com\/"},{"key":"e_1_3_3_3_18_2","unstructured":"Andrej Karpathy. 2025. There\u2019s a new kind of coding I call \u201cvibe coding\u201d. X (formerly Twitter). https:\/\/x.com\/karpathy\/status\/1886192184808149383 Accessed: 2025-02-01."},{"key":"e_1_3_3_3_19_2","first-page":"87","volume-title":"Positioning and Power in Academic Publishing: Players, Agents and Agendas","author":"Kluyver Thomas","year":"2016","unstructured":"Thomas Kluyver, Benjamin Ragan-Kelley, Fernando P\u00e9rez, Brian\u00a0E. Granger, Matthias Bussonnier, Jonathan Frederic, Kyle Kelley, Jessica\u00a0B. Hamrick, Jason Grout, Sylvain Corlay, Paul Ivanov, Dami\u00e1n Avila, Safia Abdalla, Carol Willing, and Jupyter Development Team. 2016. Jupyter Notebooks \u2013 a publishing format for reproducible computational workflows. In Positioning and Power in Academic Publishing: Players, Agents and Agendas. 87\u201390. https:\/\/jupyter.org\/"},{"key":"e_1_3_3_3_20_2","unstructured":"Tianyang Liu Canwen Xu and Julian McAuley. 2023. RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.03091 (2023)."},{"key":"e_1_3_3_3_21_2","unstructured":"Lovable. 2025. AI-powered full-stack app builder. https:\/\/lovable.dev\/."},{"key":"e_1_3_3_3_22_2","unstructured":"Jeffrey\u00a0Jian Ma Milad Hashemi Amir Yazdanbakhsh Kevin Swersky Ofir Press Enhui Li Vijay\u00a0Janapa Reddi and Parthasarathy Ranganathan. 2025. SWE-fficiency: Can Language Models Optimize Real-World Repositories on Real Workloads? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.06090 (2025). https:\/\/arxiv.org\/abs\/2511.06090"},{"key":"e_1_3_3_3_23_2","unstructured":"Kevin Meng Vincent Huang Jacob Steinhardt and Sarah Schwettmann. 2025. Introducing Docent. https:\/\/transluce.org\/introducing-docent."},{"key":"e_1_3_3_3_24_2","unstructured":"Microsoft. 2024. ARIA Snapshots \u2013 Playwright Documentation. https:\/\/playwright.dev\/docs\/aria-snapshots."},{"key":"e_1_3_3_3_25_2","unstructured":"Microsoft. 2024. Playwright \u2013 Fast and reliable end-to-end testing for modern web apps. https:\/\/playwright.dev\/."},{"key":"e_1_3_3_3_26_2","unstructured":"Microsoft. 2024. Playwright MCP. https:\/\/github.com\/microsoft\/playwright-mcp."},{"key":"e_1_3_3_3_27_2","unstructured":"Samuel Miserendino et\u00a0al. 2025. SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering? OpenAI. https:\/\/openai.com\/index\/swe-lancer\/"},{"key":"e_1_3_3_3_28_2","unstructured":"OpenAI. 2024. Introducing SWE-bench Verified. OpenAI Blog. https:\/\/openai.com\/index\/introducing-swe-bench-verified\/"},{"key":"e_1_3_3_3_29_2","unstructured":"OpenAI. 2025. Apply Patch Tool. https:\/\/platform.openai.com\/docs\/guides\/tools-apply-patch."},{"key":"e_1_3_3_3_30_2","unstructured":"OpenAI. 2025. Computer-Using Agent. https:\/\/openai.com\/index\/computer-using-agent\/."},{"key":"e_1_3_3_3_31_2","unstructured":"Replit. 2025. Introducing Replit Agent. https:\/\/blog.replit.com\/introducing-replit-agent."},{"key":"e_1_3_3_3_32_2","unstructured":"Jakob Steinschaden. 2025. Vibe coding startups: valuations grew by 350% in one year huge revenue multiples. Trending Topics. https:\/\/www.trendingtopics.eu\/vibe-coding-startups-valuations-grew-by-350-in-one-year-huge-revenue-multiples\/ Accessed: 2025-09-11."},{"key":"e_1_3_3_3_33_2","unstructured":"The\u00a0Cline Team. 2025. cline-bench: A Real-World Open Source Benchmark for Agentic Coding. https:\/\/cline.bot\/blog\/cline-bench-initiative."},{"key":"e_1_3_3_3_34_2","unstructured":"The Terminal-Bench Team. 2025. Terminal-Bench: A Benchmark for AI Agents in Terminal Environments. https:\/\/github.com\/laude-institute\/terminal-bench."},{"key":"e_1_3_3_3_35_2","unstructured":"Shengbang Tong Zhuang Liu Yuexiang Zhai Yi Ma Yann LeCun and Saining Xie. 2024. Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.06209 (2024). https:\/\/arxiv.org\/abs\/2401.06209"},{"key":"e_1_3_3_3_36_2","unstructured":"Hung Tran Langston Nashold Rayan Krishnan Antoine Bigeard and Alex Gu. 2025. Vibe Code Bench: Can AI Models Build Web Applications from Scratch? Vals AI. https:\/\/vals.ai\/benchmarks\/vibe-code Accessed: 2025-11-21."},{"key":"e_1_3_3_3_37_2","unstructured":"Vercel. 2024. v0 \u2013 AI-powered UI generation. https:\/\/v0.dev\/."},{"key":"e_1_3_3_3_38_2","unstructured":"Xingyao Wang Boxuan Li Yufan Song Frank\u00a0F. Xu Xiangru Tang Mingchen Zhuge Jiayi Pan Yueqi Song Bowen Li Jaskirat Singh Hoang\u00a0H. Tran Fuqiang Li Ren Ma Mingzhang Zheng Bill Qian Yanjun Shao Niklas Muennighoff Yizhe Zhang Binyuan Hui Junyang Lin Robert Brennan Hao Peng Heng Ji and Graham Neubig. 2024. OpenHands: An Open Platform for AI Software Developers as Generalist Agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.16741 (2024). https:\/\/github.com\/All-Hands-AI\/OpenHands"},{"key":"e_1_3_3_3_39_2","unstructured":"Blair Yang Fuyang Cui Keiran Paster Jimmy Ba Pashootan Vaezipoor Silviu Pitis and Michael\u00a0R. Zhang. 2024. Report Cards: Qualitative Evaluation of Language Models Using Natural Language Summaries. arxiv:https:\/\/arXiv.org\/abs\/2409.00844\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2409.00844"},{"key":"e_1_3_3_3_40_2","volume-title":"Proceedings of the 13th International Conference on Learning Representations (ICLR)","author":"Yang John","year":"2025","unstructured":"John Yang et\u00a0al. 2025. SWE-bench Multimodal: Do AI Systems Generalize to Visual Software Domains?. In Proceedings of the 13th International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_3_41_2","unstructured":"Shunyu Yao Noah Shinn Pedram Razavi and Karthik Narasimhan. 2024. \u03c4 -bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.12045 (2024). https:\/\/arxiv.org\/abs\/2406.12045"},{"key":"e_1_3_3_3_42_2","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik Narasimhan and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. arxiv:https:\/\/arXiv.org\/abs\/2210.03629\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2210.03629"},{"key":"e_1_3_3_3_43_2","unstructured":"Linghao Zhang Shilin He Chaoyun Zhang Yu Kang Bowen Li Chengxing Xie Junhao Wang Maoquan Wang Yufan Huang Shengyu Fu Elsie Nallipogu Qingwei Lin Yingnong Dang Saravan Rajmohan and Dongmei Zhang. 2025. SWE-bench Goes Live! arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.23419 (2025). https:\/\/arxiv.org\/abs\/2505.23419"},{"key":"e_1_3_3_3_44_2","unstructured":"Hongda Zhu Yiwen Zhang Bing Zhao Jingzhe Ding Siyao Liu Tong Liu Dandan Wang Yanan Liu and Zhaojian Li. 2025. FrontendBench: A Benchmark for Evaluating LLMs on Front-End Development via Automatic Evaluation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.13832 (2025). https:\/\/arxiv.org\/abs\/2506.13832"}],"event":{"name":"CAIS '26: ACM Conference on AI and Agentic Systems","location":"San Jose CA USA","acronym":"CAIS '26"},"container-title":["Proceedings of the ACM Conference on AI and Agentic Systems"],"original-title":[],"deposited":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T03:24:45Z","timestamp":1779420285000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3786335.3813162"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,26]]},"references-count":43,"alternative-id":["10.1145\/3786335.3813162","10.1145\/3786335"],"URL":"https:\/\/doi.org\/10.1145\/3786335.3813162","relation":{},"subject":[],"published":{"date-parts":[[2026,5,26]]},"assertion":[{"value":"2026-05-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}