{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T20:02:51Z","timestamp":1779825771063,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,31]]},"DOI":"10.1145\/3788853.3803099","type":"proceedings-article","created":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T19:14:47Z","timestamp":1779822887000},"page":"281-292","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ConDABench: Interactive Evaluation of Language Models for Data Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7939-0157","authenticated-orcid":false,"given":"Avik","family":"Dutta","sequence":"first","affiliation":[{"name":"Microsoft, Bangalore, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5599-5004","authenticated-orcid":false,"given":"Priyanshu","family":"Gupta","sequence":"additional","affiliation":[{"name":"Microsoft, Bangalore, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1715-9830","authenticated-orcid":false,"given":"Hosein","family":"Hasanbeig","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5829-0797","authenticated-orcid":false,"given":"Rahul","family":"Pratap Singh","sequence":"additional","affiliation":[{"name":"Microsoft, Bangalore, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0962-3779","authenticated-orcid":false,"given":"Harshit","family":"Nigam","sequence":"additional","affiliation":[{"name":"Microsoft, Bangalore, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9226-9634","authenticated-orcid":false,"given":"Sumit","family":"Gulwani","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5559-5932","authenticated-orcid":false,"given":"Arjun","family":"Radhakrishna","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8061-9000","authenticated-orcid":false,"given":"Gustavo","family":"Soares","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5153-2686","authenticated-orcid":false,"given":"Ashish","family":"Tiwari","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,5,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.401"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664646.3664778"},{"key":"e_1_3_2_1_3_1","first-page":"570","volume-title":"Nature","volume":"624","author":"Boiko Daniil A","year":"2023","unstructured":"Daniil A Boiko, Robert MacKnight, Ben Kline, and Gabe Gomes. 2023. Autonomous chemical research with large language models. Nature, Vol. 624, 7992 (2023), 570-578."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2207676.2208550"},{"key":"e_1_3_2_1_5_1","volume-title":"Tabfact: A large-scale dataset for table-based fact verification. arXiv preprint arXiv:1909.02164","author":"Chen Wenhu","year":"2019","unstructured":"Wenhu Chen, Hongmin Wang, Jianshu Chen, Yunkai Zhang, Hong Wang, Shiyang Li, Xiyou Zhou, and William Yang Wang. 2019. Tabfact: A large-scale dataset for table-based fact verification. arXiv preprint arXiv:1909.02164 (2019)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_7_1","volume-title":"Studies in the Way of Words","author":"Grice Paul","unstructured":"Paul Grice. 1991. Studies in the Way of Words. Harvard University Press."},{"key":"e_1_3_2_1_8_1","volume-title":"Blade: Benchmarking language model agents for data-driven science. arXiv preprint arXiv:2408.09667","author":"Gu Ken","year":"2024","unstructured":"Ken Gu, Ruoxi Shang, Ruien Jiang, Keying Kuang, Richard-John Lin, Donghe Lyu, Yue Mao, Youran Pan, Teng Wu, Jiaqian Yu, et al., 2024. Blade: Benchmarking language model agents for data-driven science. arXiv preprint arXiv:2408.09667 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ebiom.2023.104512"},{"key":"e_1_3_2_1_10_1","unstructured":"Sirui Hong Yizhang Lin Bang Liu Bangbang Liu Binhao Wu Ceyao Zhang Chenxing Wei Danyang Li Jiaqi Chen Jiayi Zhang et al. 2024. Data interpreter: An llm agent for data science. arXiv preprint arXiv:2402.18679 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Infiagent-dabench: Evaluating agents on data analysis tasks. arXiv preprint arXiv:2401.05507","author":"Hu Xueyu","year":"2024","unstructured":"Xueyu Hu, Ziyu Zhao, Shuang Wei, Ziwei Chai, Qianli Ma, Guoyin Wang, Xuwu Wang, Jing Su, Jingjing Xu, Ming Zhu, et al., 2024. Infiagent-dabench: Evaluating agents on data analysis tasks. arXiv preprint arXiv:2401.05507 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"NeurIPS 2023 Foundation Models for Decision Making Workshop.","author":"Huang Qian","year":"2023","unstructured":"Qian Huang, Jian Vora, Percy Liang, and Jure Leskovec. 2023. Benchmarking large language models as AI research agents. In NeurIPS 2023 Foundation Models for Decision Making Workshop."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Yiming Huang Jianwen Luo Yan Yu Yitong Zhang Fangyu Lei Yifan Wei Shizhu He Lifu Huang Xiao Liu Jun Zhao et al. 2024. DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models. arXiv preprint arXiv:2410.07331 (2024).","DOI":"10.18653\/v1\/2024.emnlp-main.748"},{"key":"e_1_3_2_1_14_1","volume-title":"DSBench: How Far Are Data Science Agents to Becoming Data Science Experts? arXiv preprint arXiv:2409.07703","author":"Jing Liqiang","year":"2024","unstructured":"Liqiang Jing, Zhehui Huang, Xiaoyang Wang, Wenlin Yao, Wenhao Yu, Kaixin Ma, Hongming Zhang, Xinya Du, and Dong Yu. 2024. DSBench: How Far Are Data Science Agents to Becoming Data Science Experts? arXiv preprint arXiv:2409.07703 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Kaggle: Your Machine Learning and Data Science Community. https:\/\/www.kaggle.com\/","year":"2024","unstructured":"Kaggle. 2024. Kaggle: Your Machine Learning and Data Science Community. https:\/\/www.kaggle.com\/"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.119"},{"key":"e_1_3_2_1_17_1","unstructured":"Alex Krizhevsky Geoffrey Hinton et al. 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_1_18_1","volume-title":"Tapilot-Crossing: Benchmarking and Evolving LLMs Towards Interactive Data Analysis Agents. arXiv preprint arXiv:2403.05307","author":"Li Jinyang","year":"2024","unstructured":"Jinyang Li, Nan Huo, Yan Gao, Jiayi Shi, Yingxiu Zhao, Ge Qu, Yurong Wu, Chenhao Ma, Jian-Guang Lou, and Reynold Cheng. 2024. Tapilot-Crossing: Benchmarking and Evolving LLMs Towards Interactive Data Analysis Agents. arXiv preprint arXiv:2403.05307 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.598"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Yang Liu Dan Iter Yichong Xu Shuohang Wang Ruochen Xu and Chenguang Zhu. 2023. G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. arXiv:2303.16634 [cs.CL] https:\/\/arxiv.org\/abs\/2303.16634","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_2_1_21_1","unstructured":"Andrew Low and Z. Yasemin Kalender. 2023. Data Dialogue with ChatGPT: Using Code Interpreter to Simulate and Analyse Experimental Data. arXiv:2311.12415 [physics.ed-ph] https:\/\/arxiv.org\/abs\/2311.12415"},{"key":"e_1_3_2_1_22_1","volume-title":"SpreadsheetBench: Towards Challenging Real World Spreadsheet Manipulation. arXiv preprint arXiv:2406.14991","author":"Ma Zeyao","year":"2024","unstructured":"Zeyao Ma, Bohan Zhang, Jing Zhang, Jifan Yu, Xiaokang Zhang, Xiaohan Zhang, Sijia Luo, Xi Wang, and Jie Tang. 2024. SpreadsheetBench: Towards Challenging Real World Spreadsheet Manipulation. arXiv preprint arXiv:2406.14991 (2024)."},{"key":"e_1_3_2_1_23_1","unstructured":"Friso Kingma Martin Iglesias Alex Egg. 2025. Data Agent Benchmark for Multi-step Reasoning (DABstep). https:\/\/www.adyen.com\/knowledge-hub\/data-agent-benchmark-for-multi-step-reasoning-dabstep"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00446"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.12125"},{"key":"e_1_3_2_1_26_1","unstructured":"Curtis G. Northcutt Anish Athalye and Jonas Mueller. 2021a. Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks. arXiv:2103.14749 [stat.ML] https:\/\/arxiv.org\/abs\/2103.14749"},{"key":"e_1_3_2_1_27_1","unstructured":"OpenAI. 2024. Assistants Api Overview. https:\/\/platform.openai.com\/docs\/assistants"},{"key":"e_1_3_2_1_28_1","volume-title":"Compositional semantic parsing on semi-structured tables. arXiv preprint arXiv:1508.00305","author":"Pasupat Panupong","year":"2015","unstructured":"Panupong Pasupat and Percy Liang. 2015. Compositional semantic parsing on semi-structured tables. arXiv preprint arXiv:1508.00305 (2015)."},{"key":"e_1_3_2_1_29_1","unstructured":"ScienceDirect. 2024. Science Direct: science health and medical research. https:\/\/www.sciencedirect.com\/"},{"key":"e_1_3_2_1_30_1","volume-title":"Tidy Tuesday: A weekly social data project. https:\/\/tidytues.day","year":"2024","unstructured":"TidyTuesday. 2024. Tidy Tuesday: A weekly social data project. https:\/\/tidytues.day"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-41623-1_7"},{"key":"e_1_3_2_1_32_1","unstructured":"Xianjie Wu Jian Yang Linzheng Chai Ge Zhang Jiaheng Liu Xinrun Du Di Liang Daixin Shu Xianfu Cheng Tianzhen Sun et al. 2024. TableBench: A Comprehensive and Complex Benchmark for Table Question Answering. arXiv preprint arXiv:2408.09174 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik Narasimhan and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629 [cs.CL] https:\/\/arxiv.org\/abs\/2210.03629"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","unstructured":"Tong Zhang Peixin Qin Yang Deng Chen Huang Wenqiang Lei Junhong Liu Dingnan Jin Hongru Liang and Tat-Seng Chua. 2024. CLAMBER: A Benchmark of Identifying and Clarifying Ambiguous Information Needs in Large Language Models. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) Lun-Wei Ku Andre Martins and Vivek Srikumar (Eds.). Association for Computational Linguistics Bangkok Thailand 10746-10766. https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.578","DOI":"10.18653\/v1\/2024.acl-long.578"},{"key":"e_1_3_2_1_35_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric P. Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arXiv:2306.05685 [cs.CL] https:\/\/arxiv.org\/abs\/2306.05685"}],"event":{"name":"SIGMOD\/PODS '26: International Conference on Management of Data","location":"Bengaluru India","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Companion of the International Conference on Management of Data"],"original-title":[],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T19:17:11Z","timestamp":1779823031000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3788853.3803099"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,30]]},"references-count":35,"alternative-id":["10.1145\/3788853.3803099","10.1145\/3788853"],"URL":"https:\/\/doi.org\/10.1145\/3788853.3803099","relation":{},"subject":[],"published":{"date-parts":[[2026,5,30]]},"assertion":[{"value":"2026-05-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}