{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,22]],"date-time":"2026-07-22T04:48:44Z","timestamp":1784695724637,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,15]],"date-time":"2024-04-15T00:00:00Z","timestamp":1713139200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"German Federal Ministry of Education and Research","award":["BIFOLD24B"],"award-info":[{"award-number":["BIFOLD24B"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,15]]},"DOI":"10.1145\/3643991.3644923","type":"proceedings-article","created":{"date-parts":[[2024,8,6]],"date-time":"2024-08-06T21:19:25Z","timestamp":1722979165000},"page":"444-456","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Data Augmentation for Supervised Code Translation Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7153-6100","authenticated-orcid":false,"given":"Binger","family":"Chen","sequence":"first","affiliation":[{"name":"Technische Universit\u00e4t Berlin, Berlin, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8053-8318","authenticated-orcid":false,"given":"Jacek","family":"Golebiowski","sequence":"additional","affiliation":[{"name":"Amazon AWS, Berlin, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2846-1373","authenticated-orcid":false,"given":"Ziawasch","family":"Abedjan","sequence":"additional","affiliation":[{"name":"Leibniz Universit\u00e4t Hannover, Hanover, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,7,2]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/www.antlr.org\/. [Online","author":"ANTLR.","year":"2023","unstructured":"2019. ANTLR. https:\/\/www.antlr.org\/. [Online; accessed 28-Apr-2023]."},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/sourceforge.net\/projects\/j2cstranslator\/. [Online","year":"2023","unstructured":"2019. Java2CSharp. https:\/\/sourceforge.net\/projects\/j2cstranslator\/. [Online; accessed 28-Apr-2023]."},{"key":"e_1_3_2_1_3_1","volume-title":"Public Git Archive. https:\/\/github.com\/src-d\/datasets\/tree\/master\/PublicGitArchive. [Online","year":"2023","unstructured":"2019. Public Git Archive. https:\/\/github.com\/src-d\/datasets\/tree\/master\/PublicGitArchive. [Online; accessed 28-Apr-2023]."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.449"},{"key":"e_1_3_2_1_5_1","volume-title":"Self-Supervised Bug Detection and Repair. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Allamanis Miltiadis","year":"2021","unstructured":"Miltiadis Allamanis, Henry Jackson-Flux, and Marc Brockschmidt. 2021. Self-Supervised Bug Detection and Repair. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual. 27865--27876."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 33nd International Conference on Machine Learning, ICML 2016, New York City, NY, USA, June 19-24, 2016 (JMLR Workshop and Conference Proceedings","volume":"182","author":"Amodei Dario","year":"2016","unstructured":"Dario Amodei, Sundaram Ananthanarayanan, Rishita Anubhai, Jingliang Bai, Eric Battenberg, Carl Case, Jared Casper, Bryan Catanzaro, Jingdong Chen, Mike Chrzanowski, Adam Coates, Greg Diamos, Erich Elsen, Jesse H. Engel, Linxi Fan, Christopher Fougner, Awni Y. Hannun, Billy Jun, Tony Han, Patrick LeGresley, Xiangang Li, Libby Lin, Sharan Narang, Andrew Y. Ng, Sherjil Ozair, Ryan Prenger, Sheng Qian, Jonathan Raiman, Sanjeev Satheesh, David Seetapun, Shubho Sengupta, Chong Wang, Yi Wang, Zhiqian Wang, Bo Xiao, Yan Xie, Dani Yogatama, Jun Zhan, and Zhenyao Zhu. 2016. Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin. In Proceedings of the 33nd International Conference on Machine Learning, ICML 2016, New York City, NY, USA, June 19-24, 2016 (JMLR Workshop and Conference Proceedings, Vol. 48). JMLR.org, 173--182."},{"key":"e_1_3_2_1_7_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual."},{"key":"e_1_3_2_1_8_1","volume-title":"Interactive Cross-language Code Retrieval with Auto-Encoders. In 36th IEEE\/ACM International Conference on Automated Software Engineering, ASE 2021","author":"Chen Binger","year":"2021","unstructured":"Binger Chen and Ziawasch Abedjan. 2021. Interactive Cross-language Code Retrieval with Auto-Encoders. In 36th IEEE\/ACM International Conference on Automated Software Engineering, ASE 2021, Melbourne, Australia, November 15-19, 2021. IEEE, 167--178."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-Companion52605.2021.00117"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00198"},{"key":"e_1_3_2_1_11_1","unstructured":"Xinyun Chen Chang Liu and Dawn Song. 2018. Tree-to-tree neural networks for program translation. In Advances in Neural Information Processing Systems. 2547--2557."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.529"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4714"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1269"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). Association for Computational Linguistics.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). Association for Computational Linguistics."},{"key":"e_1_3_2_1_16_1","volume-title":"Taylor","author":"Devries Terrance","year":"2017","unstructured":"Terrance Devries and Graham W. Taylor. 2017. Improved Regularization of Convolutional Neural Networks with Cutout. CoRR abs\/1708.04552 (2017)."},{"key":"e_1_3_2_1_17_1","volume-title":"Syntax-aware Data Augmentation for Neural Machine Translation. CoRR abs\/2004.14200","author":"Duan Sufeng","year":"2020","unstructured":"Sufeng Duan, Hai Zhao, Dongdong Zhang, and Rui Wang. 2020. Syntax-aware Data Augmentation for Neural Machine Translation. CoRR abs\/2004.14200 (2020)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2090"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.84"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1555"},{"key":"e_1_3_2_1_21_1","volume-title":"Stupid Bugs. In 20th IEEE\/ACM International Conference on Mining Software Repositories, MSR 2023","author":"Jesse Kevin","year":"2023","unstructured":"Kevin Jesse, Toufique Ahmed, Premkumar T. Devanbu, and Emily Morgan. 2023. Large Language Models and Simple, Stupid Bugs. In 20th IEEE\/ACM International Conference on Mining Software Repositories, MSR 2023, Melbourne, Australia, May 15-16, 2023. IEEE, 563--575."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2010.62"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3180155.3180187"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1549"},{"key":"e_1_3_2_1_25_1","first-page":"255","article-title":"A Diverse Data Augmentation Strategy for Low-Resource Neural Machine","volume":"11","author":"Li Yu","year":"2020","unstructured":"Yu Li, Xiao Li, Yating Yang, and Rui Dong. 2020. A Diverse Data Augmentation Strategy for Low-Resource Neural Machine Translation. Inf. 11, 5 (2020), 255.","journal-title":"Translation. Inf."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5543"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10618-008-0118-x"},{"key":"e_1_3_2_1_28_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Liu Shangqing","year":"2021","unstructured":"Shangqing Liu, Yu Chen, Xiaofei Xie, Jing Kai Siow, and Yang Liu. 2021. Retrieval-Augmented Generation for Code Summarization via Hybrid GNN. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/SANER.2018.8330202"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021","author":"Lu Shuai","year":"2021","unstructured":"Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin B. Clement, Dawn Drain, Daxin Jiang, Duyu Tang, Ge Li, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, Ming Gong, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, and Shujie Liu. 2021. CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021, December 2021, virtual."},{"key":"e_1_3_2_1_31_1","unstructured":"Edward Ma. 2019. NLP Augmentation. https:\/\/github.com\/makcedward\/nlpaug."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3196398.3196464"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3468264.3468538"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491411.2494584"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2015.74"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME.2016.89"},{"key":"e_1_3_2_1_37_1","volume-title":"Data Diversification: A Simple Strategy For Neural Machine Translation. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Nguyen Xuan-Phi","year":"2020","unstructured":"Xuan-Phi Nguyen, Shafiq R. Joty, Kui Wu, and Ai Ti Aw. 2020. Data Diversification: A Simple Strategy For Neural Machine Translation. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual."},{"key":"e_1_3_2_1_38_1","unstructured":"Ansong Ni Pengcheng Yin Yilun Zhao Martin Riddell Troy Feng Rui Shen Stephen Yin Ye Liu Semih Yavuz Caiming Xiong et al. 2023. L2CEval: Evaluating Language-to-Code Generation Capabilities of Large Language Models. arXiv preprint arXiv:2309.17446 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the 30th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering, ESEC\/FSE 2022","author":"Orvalho Pedro","year":"2022","unstructured":"Pedro Orvalho, Mikol\u00e1s Janota, and Vasco M. Manquinho. 2022. MultIPAs: applying program transformations to introductory programming assignments for data augmentation. In Proceedings of the 30th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering, ESEC\/FSE 2022, Singapore, Singapore, November 14-18, 2022. ACM, 1657--1661."},{"key":"e_1_3_2_1_40_1","volume-title":"Rahul Krishna, Divya Sankar, Lambert Pouguem Wassi, Michele Merler, Boris Sobolev, Raju Pavuluri, Saurabh Sinha, and Reyhaneh Jabbarvand.","author":"Pan Rangeet","year":"2023","unstructured":"Rangeet Pan, Ali Reza Ibrahimzada, Rahul Krishna, Divya Sankar, Lambert Pouguem Wassi, Michele Merler, Boris Sobolev, Raju Pavuluri, Saurabh Sinha, and Reyhaneh Jabbarvand. 2023. Understanding the Effectiveness of Large Language Models in Code Translation. CoRR abs\/2308.03109 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3385412.3386001"},{"key":"e_1_3_2_1_42_1","volume-title":"CodeBLEU: a Method for Automatic Evaluation of Code Synthesis. CoRR abs\/2009.10297","author":"Ren Shuo","year":"2020","unstructured":"Shuo Ren, Daya Guo, Shuai Lu, Long Zhou, Shujie Liu, Duyu Tang, Neel Sundaresan, Ming Zhou, Ambrosio Blanco, and Shuai Ma. 2020. CodeBLEU: a Method for Automatic Evaluation of Code Synthesis. CoRR abs\/2009.10297 (2020)."},{"key":"e_1_3_2_1_43_1","volume-title":"Unsupervised Translation of Programming Languages. In Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Rozi\u00e8re Baptiste","year":"2020","unstructured":"Baptiste Rozi\u00e8re, Marie-Anne Lachaux, Lowik Chanussot, and Guillaume Lample. 2020. Unsupervised Translation of Programming Languages. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_44_1","volume-title":"Leveraging Automated Unit Tests for Unsupervised Code Translation. In The Tenth International Conference on Learning Representations, ICLR 2022","author":"Rozi\u00e8re Baptiste","year":"2022","unstructured":"Baptiste Rozi\u00e8re, Jie Zhang, Fran\u00e7ois Charton, Mark Harman, Gabriel Synnaeve, and Guillaume Lample. 2022. Leveraging Automated Unit Tests for Unsupervised Code Translation. In The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022. OpenReview.net."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1009"},{"key":"e_1_3_2_1_46_1","volume-title":"On the Importance of Building High-quality Training Datasets for Neural Code Search. In 44th IEEE\/ACM 44th International Conference on Software Engineering, ICSE 2022","author":"Sun Zhensu","year":"2022","unstructured":"Zhensu Sun, Li Li, Yan Liu, Xiaoning Du, and Li Li. 2022. On the Importance of Building High-quality Training Datasets for Neural Code Search. In 44th IEEE\/ACM 44th International Conference on Software Engineering, ICSE 2022, Pittsburgh, PA, USA, May 25-27, 2022. ACM, 1609--1620."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Singh Sumeet","year":"2019","unstructured":"Vaibhav, Sumeet Singh, Craig Stewart, and Graham Neubig. 2019. Improving Robustness of Machine Translation with Synthetic Noise. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and Short Papers). Association for Computational Linguistics, 1916--1920."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1100"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2022.111304"},{"key":"e_1_3_2_1_50_1","volume-title":"5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net.","author":"Zhang Chiyuan","year":"2017","unstructured":"Chiyuan Zhang, Samy Bengio, Moritz Hardt, Benjamin Recht, and Oriol Vinyals. 2017. Understanding deep learning requires rethinking generalization. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net."},{"key":"e_1_3_2_1_51_1","volume-title":"A Study on Robustness and Reliability of Large Language Model Code Generation. CoRR abs\/2308.10335","author":"Zhong Li","year":"2023","unstructured":"Li Zhong and Zilong Wang. 2023. A Study on Robustness and Reliability of Large Language Model Code Generation. CoRR abs\/2308.10335 (2023)."}],"event":{"name":"MSR '24: 21st International Conference on Mining Software Repositories","location":"Lisbon Portugal","acronym":"MSR '24","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","IEEE CS"]},"container-title":["Proceedings of the 21st International Conference on Mining Software Repositories"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3643991.3644923","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3643991.3644923","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:56:44Z","timestamp":1750291004000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3643991.3644923"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,15]]},"references-count":51,"alternative-id":["10.1145\/3643991.3644923","10.1145\/3643991"],"URL":"https:\/\/doi.org\/10.1145\/3643991.3644923","relation":{},"subject":[],"published":{"date-parts":[[2024,4,15]]},"assertion":[{"value":"2024-07-02","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}