{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T13:01:47Z","timestamp":1777899707938,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":103,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"Research Grants Council of the Hong Kong Special Administrative Region, China"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3691620.3695503","type":"proceedings-article","created":{"date-parts":[[2024,10,18]],"date-time":"2024-10-18T15:39:19Z","timestamp":1729265959000},"page":"1282-1294","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Contextualized Data-Wrangling Code Generation in Computational Notebooks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6962-5292","authenticated-orcid":false,"given":"Junjie","family":"Huang","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0822-1517","authenticated-orcid":false,"given":"Daya","family":"Guo","sequence":"additional","affiliation":[{"name":"Sun-yat Sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5933-6620","authenticated-orcid":false,"given":"Chenglong","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5831-9474","authenticated-orcid":false,"given":"Jiazhen","family":"Gu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7466-2064","authenticated-orcid":false,"given":"Shuai","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1843-589X","authenticated-orcid":false,"given":"Jeevana Priya","family":"Inala","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3596-4083","authenticated-orcid":false,"given":"Cong","family":"Yan","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2058-9348","authenticated-orcid":false,"given":"Jianfeng","family":"Gao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3387-4674","authenticated-orcid":false,"given":"Nan","family":"Duan","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3666-5798","authenticated-orcid":false,"given":"Michael R.","family":"Lyu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Rajas Agashe Srinivasan Iyer and Luke Zettlemoyer. 2019. JuICe: A Large Scale Distantly Supervised Dataset for Open Domain Context-based Code Generation. In EMNLP-IJCNLP.","DOI":"10.18653\/v1\/D19-1546"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Wasi Ahmad Saikat Chakraborty Baishakhi Ray and Kai-Wei Chang. 2021. Unified Pre-training for Program Understanding and Generation. In NAACLHLT. 2655--2668.","DOI":"10.18653\/v1\/2021.naacl-main.211"},{"key":"e_1_3_2_1_3_1","volume-title":"Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, et al.","author":"Allal Loubna Ben","year":"2023","unstructured":"Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, et al. 2023. SantaCoder: don't reach for the stars! arXiv preprint arXiv:2301.03988 (2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"Jacob Austin Augustus Odena Maxwell Nye Maarten Bosma Henryk Michalewski David Dohan Ellen Jiang Carrie Cai Michael Terry Quoc Le et al. 2021. Program synthesis with large language models. arXiv preprint arXiv:2108.07732 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2813885.2737952"},{"key":"e_1_3_2_1_6_1","volume-title":"Efficient training of language models to fill in the middle. arXiv preprint arXiv:2207.14255","author":"Bavarian Mohammad","year":"2022","unstructured":"Mohammad Bavarian, Heewoo Jun, Nikolas Tezak, John Schulman, Christine McLeavey, Jerry Tworek, and Mark Chen. 2022. Efficient training of language models to fill in the middle. arXiv preprint arXiv:2207.14255 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Vizsmith: Automated visualization synthesis by mining data-science notebooks. In ASE.","author":"Bavishi Rohan","year":"2021","unstructured":"Rohan Bavishi, Shadaj Laddad, Hiroaki Yoshida, Mukul R Prasad, and Koushik Sen. 2021. Vizsmith: Automated visualization synthesis by mining data-science notebooks. In ASE."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3360594"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/3379393"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Sumon Biswas Mohammad Wardat and Hridesh Rajan. 2022. The art and practice of data science pipelines: A comprehensive study of data science pipelines in theory in-the-small and in-the-large. In ICSE.","DOI":"10.1145\/3510003.3510057"},{"key":"e_1_3_2_1_11_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. NeurIPS (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Training and Evaluating a Jupyter Notebook Data Science Assistant. arXiv preprint arXiv:2201.12901","author":"Chandel Shubham","year":"2022","unstructured":"Shubham Chandel, Colin B. Clement, Guillermo Serrato, and Neel Sundaresan. 2022. Training and Evaluating a Jupyter Notebook Data Science Assistant. arXiv preprint arXiv:2201.12901 (2022)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3644385"},{"key":"e_1_3_2_1_14_1","volume-title":"Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.","author":"Chen Mark","year":"2021","unstructured":"Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)."},{"key":"e_1_3_2_1_15_1","volume-title":"Plotcoder: Hierarchical decoding for synthesizing visualization code in programmatic context. In ACL-IJCNLP. 2169--2181.","author":"Chen Xinyun","year":"2021","unstructured":"Xinyun Chen, Linyuan Gong, Alvin Cheung, and Dawn Song. 2021. Plotcoder: Hierarchical decoding for synthesizing visualization code in programmatic context. In ACL-IJCNLP. 2169--2181."},{"key":"e_1_3_2_1_16_1","first-page":"22196","article-title":"Latent execution for neural program synthesis beyond domain-specific languages","volume":"34","author":"Chen Xinyun","year":"2021","unstructured":"Xinyun Chen, Dawn Song, and Yuandong Tian. 2021. Latent execution for neural program synthesis beyond domain-specific languages. Advances in Neural Information Processing Systems 34 (2021), 22196--22208.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"CoWrangler: Recommender System for Data-Wrangling Scripts. In Companion of the 2023 International Conference on Management of Data. 147--150","author":"Chopra Bhavya","year":"2023","unstructured":"Bhavya Chopra, Anna Fariha, Sumit Gulwani, Austin Z Henley, Daniel Perelman, Mohammad Raza, Sherry Shi, Danny Simmons, and Ashish Tiwari. 2023. CoWrangler: Recommender System for Data-Wrangling Scripts. In Companion of the 2023 International Conference on Management of Data. 147--150."},{"key":"e_1_3_2_1_18_1","volume-title":"Todd Mytkowicz, Bo Wang, Jianfeng Gao, and Nan Duan.","author":"Cui Haotian","year":"2022","unstructured":"Haotian Cui, Chenglong Wang, Junjie Huang, Jeevana Priya Inala, Todd Mytkowicz, Bo Wang, Jianfeng Gao, and Nan Duan. 2022. CodeExp: Explanatory Code Document Generation. In Findings of the Association for Computational Linguistics: EMNLP 2022. 2342--2354."},{"key":"e_1_3_2_1_19_1","volume-title":"Mining database structure","author":"Dasu Tamraparni","unstructured":"Tamraparni Dasu, Theodore Johnson, Shanmugauelayut Muthukrishnan, and Vladislav Shkapenyuk. 2002. Mining database structure; or, how to build a data quality browser. In SIGMOD. 240--251."},{"key":"e_1_3_2_1_20_1","volume-title":"Unified language model pretraining for natural language understanding and generation. NeurIPS","author":"Dong Li","year":"2019","unstructured":"Li Dong, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng Gao, Ming Zhou, and Hsiao-Wuen Hon. 2019. Unified language model pretraining for natural language understanding and generation. NeurIPS (2019)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Yihong Dong Ge Li and Zhi Jin. 2023. CODEP: grammatical seq2seq model for general-purpose code generation. In ISSTA. 188--198.","DOI":"10.1145\/3597926.3598048"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Zhangyin Feng Daya Guo Duyu Tang Nan Duan Xiaocheng Feng Ming Gong Linjun Shou Bing Qin Ting Liu Daxin Jiang et al. 2020. CodeBERT: A Pre-Trained Model for Programming and Natural Languages. In Findings of the Association for Computational Linguistics: EMNLP. 1536--1547.","DOI":"10.18653\/v1\/2020.findings-emnlp.139"},{"key":"e_1_3_2_1_23_1","unstructured":"Leo Gao Stella Biderman Sid Black Laurence Golding Travis Hoppe Charles Foster Jason Phang Horace He Anish Thite Noa Nabeshima et al. 2020. The pile: An 800gb dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027 (2020)."},{"key":"e_1_3_2_1_24_1","unstructured":"Shuzheng Gao Xin-Cheng Wen Cuiyun Gao Wenxuan Wang Hongyu Zhang and Michael R Lyu. 2023. What Makes Good In-Context Demonstrations for Code Intelligence Tasks with LLMs?. In ASE."},{"key":"e_1_3_2_1_25_1","unstructured":"GitHub. 2024. GitHub REST API. https:\/\/developer.github.com\/v3"},{"key":"e_1_3_2_1_26_1","unstructured":"Sumit Gulwani. 2016. Programming by Examples - and its applications in Data Wrangling. In Dependable Software Systems Engineering."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2240236.2240260"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Daya Guo Shuai Lu Nan Duan Yanlin Wang Ming Zhou and Jian Yin. 2022. UniXcoder: Unified Cross-Modal Pre-training for Code Representation. In ACL. 7212--7225.","DOI":"10.18653\/v1\/2022.acl-long.499"},{"key":"e_1_3_2_1_29_1","unstructured":"Daya Guo Shuo Ren Shuai Lu Zhangyin Feng Duyu Tang LIU Shujie Long Zhou Nan Duan Alexey Svyatkovskiy Shengyu Fu et al. 2020. GraphCodeBERT: Pre-training Code Representations with Data Flow. In ICLR."},{"key":"e_1_3_2_1_30_1","unstructured":"Daya Guo Qihao Zhu Dejian Yang Zhenda Xie Kai Dong Wentao Zhang Guanting Chen Xiao Bi Y Wu YK Li et al. 2024. DeepSeek-Coder: When the Large Language Model Meets Programming-The Rise of Code Intelligence. arXiv preprint arXiv:2401.14196 (2024)."},{"key":"e_1_3_2_1_31_1","unstructured":"Dan Hendrycks Steven Basart Saurav Kadavath Mantas Mazeika Akul Arora Ethan Guo Collin Burns Samir Puranik Horace He Dawn Song and Jacob Steinhardt. 2021. Measuring Coding Challenge Competence With APPS. In NeurIPS."},{"key":"e_1_3_2_1_32_1","unstructured":"Edward J Hu Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang Weizhu Chen et al. 2023. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Junjie Huang Duyu Tang Linjun Shou Ming Gong Ke Xu Daxin Jiang Ming Zhou and Nan Duan. 2021. CoSQA: 20 000+ Web Queries for Code Search and Question Answering. In ACL-IJCNLP. 5690--5700.","DOI":"10.18653\/v1\/2021.acl-long.442"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the Fourth Workshop on Data Science with Human-in-the-Loop (Language Advances). 28--36","author":"Huang Junjie","year":"2022","unstructured":"Junjie Huang, Chenglong Wang, Jipeng Zhang, Cong Yan, Haotian Cui, Jeevana Priya Inala, Colin B. Clement, Nan Duan, and Jianfeng Gao. 2022. Execution-based Evaluation for Data Science Code Generation Models. In Proceedings of the Fourth Workshop on Data Science with Human-in-the-Loop (Language Advances). 28--36."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.303"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3660818"},{"key":"e_1_3_2_1_37_1","unstructured":"Srinivasan Iyer Ioannis Konstas Alvin Cheung and Luke Zettlemoyer. 2018. Mapping Language to Code in Programmatic Context. In EMNLP. 1643--1652."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3510003.3510203"},{"key":"e_1_3_2_1_39_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_40_1","unstructured":"Kaggle. 2024. Kaggle. https:\/\/www.kaggle.com"},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR, 5110--5121","author":"Kanade Aditya","year":"2020","unstructured":"Aditya Kanade, Petros Maniatis, Gogul Balakrishnan, and Kensen Shi. 2020. Learning and evaluating contextual embedding of source code. In International conference on machine learning. PMLR, 5110--5121."},{"key":"e_1_3_2_1_42_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00778-022-00776-8"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Mary Beth Kery Marissa Radensky Mahima Arya Bonnie E John and Brad A Myers. 2018. The story in the notebook: Exploratory data science using a literate programming tool. In CHI. 1--11.","DOI":"10.1145\/3173574.3173748"},{"key":"e_1_3_2_1_45_1","volume-title":"Code: Harnessing Data for Program Synthesis from Natural Language. arXiv preprint arXiv:2305.01598","author":"Khatry Anirudh","year":"2023","unstructured":"Anirudh Khatry, Joyce Cahoon, Jordan Henkel, Shaleen Deep, Venkatesh Emani, Avrilia Floratou, Sumit Gulwani, Vu Le, Mohammad Raza, Sherry Shi, et al. 2023. From Words to Code: Harnessing Data for Program Synthesis from Natural Language. arXiv preprint arXiv:2305.01598 (2023)."},{"key":"e_1_3_2_1_46_1","unstructured":"Thomas Kluyver Benjamin Ragan-Kelley Fernando P\u00e9rez Brian E. Granger Matthias Bussonnier Jonathan Frederic Kyle Kelley Jessica B. Hamrick Jason Grout Sylvain Corlay Paul Ivanov Dami\u00e1n Avila Safia Abdalla Carol Willing and Jupyter Development Team. 2016. Jupyter Notebooks - a publishing format for reproducible computational workflows. In ELPUB."},{"key":"e_1_3_2_1_47_1","volume-title":"Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, et al.","author":"Kocetkov Denis","year":"2023","unstructured":"Denis Kocetkov, Raymond Li, Loubna Ben Allal, Jia Li, Chenghao Mou, Carlos Mu\u00f1oz Ferrandis, Yacine Jernite, Margaret Mitchell, Sean Hughes, Thomas Wolf, et al. 2023. The stack: 3 tb of permissively licensed source code. Transactions on Machine Learning Research (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lai Yuhang","year":"2023","unstructured":"Yuhang Lai, Chengxi Li, Yiming Wang, Tianyi Zhang, Ruiqi Zhong, Luke Zettlemoyer, Wen-tau Yih, Daniel Fried, Sida Wang, and Tao Yu. 2023. DS-1000: A natural and reliable benchmark for data science code generation. In International Conference on Machine Learning. PMLR, 18319--18345."},{"key":"e_1_3_2_1_49_1","first-page":"1","article-title":"Deep Learning for Source Code Modeling and Generation","volume":"53","author":"Minh Le Triet Huynh","year":"2020","unstructured":"Triet Huynh Minh Le, Hao Chen, and Muhammad Ali Babar. 2020. Deep Learning for Source Code Modeling and Generation. ACM Computing Surveys (CSUR) 53 (2020), 1--38.","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/2594291.2594333"},{"key":"e_1_3_2_1_51_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. 7871--7880.","author":"Lewis Mike","year":"2020","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. 7871--7880."},{"key":"e_1_3_2_1_52_1","volume-title":"Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al.","author":"Li Raymond","year":"2023","unstructured":"Raymond Li, Loubna Ben Allal, Yangtian Zi, Niklas Muennighoff, Denis Kocetkov, Chenghao Mou, Marc Marone, Christopher Akiki, Jia Li, Jenny Chim, et al. 2023. Starcoder: may the source be with you! arXiv preprint arXiv:2305.06161 (2023)."},{"key":"e_1_3_2_1_53_1","first-page":"1","article-title":"Edassistant: Supporting exploratory data analysis in computational notebooks with in situ code search and recommendation","volume":"13","author":"Li Xingjun","year":"2023","unstructured":"Xingjun Li, Yizhi Zhang, Justin Leung, Chengnian Sun, and Jian Zhao. 2023. Edassistant: Supporting exploratory data analysis in computational notebooks with in situ code search and recommendation. ACM Transactions on Interactive Intelligent Systems 13, 1 (2023), 1--27.","journal-title":"ACM Transactions on Interactive Intelligent Systems"},{"key":"e_1_3_2_1_54_1","volume-title":"Exploring the effectiveness of llms in automated logging generation: An empirical study. arXiv preprint arXiv:2307.05950","author":"Li Yichen","year":"2023","unstructured":"Yichen Li, Yintong Huo, Zhihan Jiang, Renyi Zhong, Pinjia He, Yuxin Su, and Michael R Lyu. 2023. Exploring the effectiveness of llms in automated logging generation: An empirical study. arXiv preprint arXiv:2307.05950 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643754"},{"key":"e_1_3_2_1_56_1","volume-title":"Enhancing LLM-Based Coding Tools through Native Integration of IDE-Derived Static Context. arXiv preprint arXiv:2402.03630","author":"Li Yichen","year":"2024","unstructured":"Yichen Li, Yun Peng, Yintong Huo, and Michael R Lyu. 2024. Enhancing LLM-Based Coding Tools through Native Integration of IDE-Derived Static Context. arXiv preprint arXiv:2402.03630 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC","author":"Lin Xi Victoria","year":"2018","unstructured":"Xi Victoria Lin, Chenglong Wang, Luke Zettlemoyer, and Michael D Ernst. 2018. NL2Bash: A Corpus and Semantic Parser for Natural Language Interface to the Linux Operating System. In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)."},{"key":"e_1_3_2_1_58_1","volume-title":"Yuyao Wang, and Lingming Zhang.","author":"Liu Jiawei","year":"2024","unstructured":"Jiawei Liu, Chunqiu Steven Xia, Yuyao Wang, and Lingming Zhang. 2024. Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation. NeurIPS (2024)."},{"key":"e_1_3_2_1_59_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_60_1","volume-title":"CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Lu Shuai","year":"2021","unstructured":"Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Duyu Tang, Ge Li, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, MING GONG, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, and Shujie LIU. 2021. CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_1_61_1","first-page":"51","article-title":"Data structures for statistical computing in Python","volume":"445","author":"McKinney Wes","year":"2010","unstructured":"Wes McKinney et al. 2010. Data structures for statistical computing in Python.. In SciPy, Vol. 445. 51--56.","journal-title":"SciPy"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Na Meng Miryung Kim and Kathryn S McKinley. 2013. LASE: locating and applying systematic edits by learning from examples. In ICSE.","DOI":"10.1109\/ICSE.2013.6606596"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Tamal Mondal Scott Barnett Akash Lal and Jyothi Vedurada. 2023. Cell2Doc: ML Pipeline for Generating Documentation in Computational Notebooks. In ASE.","DOI":"10.1109\/ASE56229.2023.00200"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3520312.3534866"},{"key":"e_1_3_2_1_65_1","volume-title":"Codegen: An open large language model for code with multi-turn program synthesis. In ICLR.","author":"Nijkamp Erik","year":"2022","unstructured":"Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong. 2022. Codegen: An open large language model for code with multi-turn program synthesis. In ICLR."},{"key":"e_1_3_2_1_66_1","volume-title":"Learning to Generate Pseudo-Code from Source Code Using Statistical Machine Translation. ASE","author":"Oda Yusuke","year":"2015","unstructured":"Yusuke Oda, Hiroyuki Fudaba, Graham Neubig, Hideaki Hata, Sakriani Sakti, Tomoki Toda, and Satoshi Nakamura. 2015. Learning to Generate Pseudo-Code from Source Code Using Statistical Machine Translation. ASE (2015), 574--584."},{"key":"e_1_3_2_1_67_1","unstructured":"OpenAI. 2024. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_68_1","unstructured":"Stack Overflow. 2024. Stack Overflow. https:\/\/stackoverflow.com\/"},{"key":"e_1_3_2_1_69_1","volume-title":"Rishabh Singh, Lihong Li, Dengyong Zhou, and Pushmeet Kohli.","author":"Parisotto Emilio","year":"2017","unstructured":"Emilio Parisotto, Abdel rahman Mohamed, Rishabh Singh, Lihong Li, Dengyong Zhou, and Pushmeet Kohli. 2017. Neuro-Symbolic Program Synthesis. In ICLR."},{"key":"e_1_3_2_1_70_1","unstructured":"Richard E. Pattis Jim Roberts and Mark Stehlik. 1994. Karel the Robot: A Gentle Introduction to the Art of Programming."},{"key":"e_1_3_2_1_71_1","unstructured":"Phind. 2024. Phind\/phind-codellama-34b-v2. https:\/\/huggingface.co\/Phind\/Phind-CodeLlama-34B-v2"},{"key":"e_1_3_2_1_72_1","unstructured":"Illia Polosukhin and Alexander Skidanov. 2018. Neural Program Search: Solving Data Processing Tasks from Description and Examples. In ICLR."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_74_1","volume-title":"Codebleu: a method for automatic evaluation of code synthesis. arXiv preprint arXiv:2009.10297","author":"Ren Shuo","year":"2020","unstructured":"Shuo Ren, Daya Guo, Shuai Lu, Long Zhou, Shujie Liu, Duyu Tang, Neel Sundaresan, Ming Zhou, Ambrosio Blanco, and Shuai Ma. 2020. Codebleu: a method for automatic evaluation of code synthesis. arXiv preprint arXiv:2009.10297 (2020)."},{"key":"e_1_3_2_1_75_1","volume-title":"Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al.","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al. 2023. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Adam Rule Aur\u00e9lien Tabard and James D Hollan. 2018. Exploration and explanation in computational notebooks. In CHI. 1--12.","DOI":"10.1145\/3274419"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"Freda Shi Daniel Fried Marjan Ghazvininejad Luke Zettlemoyer and Sida I Wang. 2022. Natural Language to Code Translation with Execution. In EMNLP.","DOI":"10.18653\/v1\/2022.emnlp-main.231"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3517034"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Pavle Suboti\u0107 Lazar Miliki\u0107 and Milan Stoji\u0107. 2022. A static analysis framework for data science notebooks. In ICSE-SEIP. 13--22.","DOI":"10.1145\/3510457.3513032"},{"key":"e_1_3_2_1_80_1","volume-title":"Resplit: improving the structure of jupyter notebooks by re-splitting their cells","author":"Titov Sergey","unstructured":"Sergey Titov, Yaroslav Golubev, and Timofey Bryksin. 2022. Resplit: improving the structure of jupyter notebooks by re-splitting their cells. In SANER. IEEE, 492--496."},{"key":"e_1_3_2_1_81_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_82_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489465"},{"key":"e_1_3_2_1_84_1","volume-title":"Data Formulator: AI-powered Concept-driven Visualization Authoring","author":"Wang Chenglong","year":"2023","unstructured":"Chenglong Wang, John Thompson, and Bongshin Lee. 2023. Data Formulator: AI-powered Concept-driven Visualization Authoring. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"crossref","unstructured":"Chaozheng Wang Yuanhang Yang Cuiyun Gao Yun Peng Hongyu Zhang and Michael R Lyu. 2022. No more fine-tuning? an experimental evaluation of prompt tuning in code intelligence. In ESEC\/FSE. 382--394.","DOI":"10.1145\/3540250.3549113"},{"key":"e_1_3_2_1_86_1","volume-title":"AutoDS: Towards Human-Centered Automation of Data Science. CHI","author":"Wang Dakuo","year":"2021","unstructured":"Dakuo Wang, Josh Andres, Justin D. Weisz, Erick Oduor, and Casey Dugan. 2021. AutoDS: Towards Human-Centered Automation of Data Science. CHI (2021)."},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"crossref","unstructured":"Jiawei Wang Tzu-yang Kuo Li Li and Andreas Zeller. 2020. Assessing and restoring reproducibility of Jupyter notebooks. In ASE.","DOI":"10.1145\/3324884.3416585"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"crossref","unstructured":"Jiawei Wang Li Li and Andreas Zeller. 2021. Restoring Execution Environments of Jupyter Notebooks. In ICSE. 1622--1633.","DOI":"10.1109\/ICSE43902.2021.00144"},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"crossref","unstructured":"Shiqi Wang Zheng Li Haifeng Qian Chenghao Yang Zijian Wang Mingyue Shang Varun Kumar Samson Tan Baishakhi Ray Parminder Bhatia et al. 2023. ReCode: Robustness Evaluation of Code Generation Models. In ACL.","DOI":"10.18653\/v1\/2023.acl-long.773"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"crossref","unstructured":"Yue Wang Weishi Wang Shafiq Joty and Steven CH Hoi. 2021. CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation. In EMNLP. 8696--8708.","DOI":"10.18653\/v1\/2021.emnlp-main.685"},{"key":"e_1_3_2_1_91_1","volume-title":"Mconala: a benchmark for code generation from multiple natural languages. arXiv preprint arXiv:2203.08388","author":"Wang Zhiruo","year":"2022","unstructured":"Zhiruo Wang, Grace Cuenca, Shuyan Zhou, Frank F Xu, and Graham Neubig. 2022. Mconala: a benchmark for code generation from multiple natural languages. arXiv preprint arXiv:2203.08388 (2022)."},{"key":"e_1_3_2_1_92_1","volume-title":"Execution-Based Evaluation for Open-Domain Code Generation. arXiv preprint arXiv:2212.10481","author":"Wang Zhiruo","year":"2022","unstructured":"Zhiruo Wang, Shuyan Zhou, Daniel Fried, and Graham Neubig. 2022. Execution-Based Evaluation for Open-Domain Code Generation. arXiv preprint arXiv:2212.10481 (2022)."},{"key":"e_1_3_2_1_93_1","volume-title":"SuperNOVA: Design Strategies and Opportunities for Interactive Visualization in Computational Notebooks. arXiv preprint arXiv:2305.03039","author":"Wang Zijie J","year":"2023","unstructured":"Zijie J Wang, David Munechika, Seongmin Lee, and Duen Horng Chau. 2023. SuperNOVA: Design Strategies and Opportunities for Interactive Visualization in Computational Notebooks. arXiv preprint arXiv:2305.03039 (2023)."},{"key":"e_1_3_2_1_94_1","volume-title":"Exploring parameter-efficient fine-tuning techniques for code generation with large language models. arXiv preprint arXiv:2308.10462","author":"Weyssow Martin","year":"2023","unstructured":"Martin Weyssow, Xin Zhou, Kisub Kim, David Lo, and Houari Sahraoui. 2023. Exploring parameter-efficient fine-tuning techniques for code generation with large language models. arXiv preprint arXiv:2308.10462 (2023)."},{"key":"e_1_3_2_1_95_1","unstructured":"Yifan Wu Joseph M Hellerstein and Arvind Satyanarayan. 2020. B2: Bridging code and interactive visualization in computational notebooks. In UIST. 152--165."},{"key":"e_1_3_2_1_96_1","volume-title":"Auto-suggest: Learning-to-recommend data preparation steps using data science notebooks. In SIGMOD. 1539--1554.","author":"Yan Cong","year":"2020","unstructured":"Cong Yan and Yeye He. 2020. Auto-suggest: Learning-to-recommend data preparation steps using data science notebooks. In SIGMOD. 1539--1554."},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3196398.3196408"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"crossref","unstructured":"Pengcheng Yin Wen-Ding Li Kefan Xiao Abhishek Rao Yeming Wen Kensen Shi Joshua Howland Paige Bailey Michele Catasta Henryk Michalewski et al. 2023. Natural Language to Code Generation in Interactive Data Science Notebooks. In ACL. 126--173.","DOI":"10.18653\/v1\/2023.acl-long.9"},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"crossref","unstructured":"Pengcheng Yin John Wieting Avirup Sil and Graham Neubig. 2022. On The Ingredients of an Effective Zero-shot Semantic Parser. In ACL. 1455--1474.","DOI":"10.18653\/v1\/2022.acl-long.103"},{"key":"e_1_3_2_1_100_1","volume-title":"Radev","author":"Yu Tao","year":"2018","unstructured":"Tao Yu, Rui Zhang, Kai-Chou Yang, Michihiro Yasunaga, Dongxu Wang, Zifan Li, James Ma, Irene Z Li, Qingning Yao, Shanelle Roman, Zilin Zhang, and Dragomir R. Radev. 2018. Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task. In EMNLP."},{"key":"e_1_3_2_1_101_1","volume-title":"Seq2sql: Generating structured queries from natural language using reinforcement learning. arXiv preprint arXiv:1709.00103","author":"Zhong Victor","year":"2017","unstructured":"Victor Zhong, Caiming Xiong, and Richard Socher. 2017. Seq2sql: Generating structured queries from natural language using reinforcement learning. arXiv preprint arXiv:1709.00103 (2017)."},{"key":"e_1_3_2_1_102_1","volume-title":"DocCoder: Generating Code by Retrieving and Reading Docs. arXiv preprint arXiv:2207.05987","author":"Zhou Shuyan","year":"2022","unstructured":"Shuyan Zhou, Uri Alon, Frank F Xu, Zhengbao Jiang, and Graham Neubig. 2022. DocCoder: Generating Code by Retrieving and Reading Docs. arXiv preprint arXiv:2207.05987 (2022)."},{"key":"e_1_3_2_1_103_1","volume-title":"Xlcost: A benchmark dataset for cross-lingual code intelligence. arXiv preprint arXiv:2206.08474","author":"Zhu Ming","year":"2022","unstructured":"Ming Zhu, Aneesh Jain, Karthik Suresh, Roshan Ravindran, Sindhu Tipirneni, and Chandan K Reddy. 2022. Xlcost: A benchmark dataset for cross-lingual code intelligence. arXiv preprint arXiv:2206.08474 (2022)."}],"event":{"name":"ASE '24: 39th IEEE\/ACM International Conference on Automated Software Engineering","location":"Sacramento CA USA","acronym":"ASE '24","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGSOFT ACM Special Interest Group on Software Engineering","IEEE CS"]},"container-title":["Proceedings of the 39th IEEE\/ACM International Conference on Automated Software Engineering"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3691620.3695503","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3691620.3695503","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:19Z","timestamp":1750291579000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3691620.3695503"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":103,"alternative-id":["10.1145\/3691620.3695503","10.1145\/3691620"],"URL":"https:\/\/doi.org\/10.1145\/3691620.3695503","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2024-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}