{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,26]],"date-time":"2025-12-26T07:11:46Z","timestamp":1766733106482,"version":"3.40.3"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031754333"},{"type":"electronic","value":"9783031754340"}],"license":[{"start":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T00:00:00Z","timestamp":1735516800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T00:00:00Z","timestamp":1735516800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-75434-0_8","type":"book-chapter","created":{"date-parts":[[2024,12,29]],"date-time":"2024-12-29T09:24:14Z","timestamp":1735464254000},"page":"107-124","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["SEGym: Optimizing Large Language Model Assisted Software Engineering Agents with\u00a0Reinforcement Learning"],"prefix":"10.1007","author":[{"given":"Gerhard","family":"Stenzel","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kyrill","family":"Schmid","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"K\u00f6lle","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Philipp","family":"Altmann","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Marian","family":"Lingsch-Rosenfeld","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Maximilian","family":"Zorn","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tim","family":"B\u00fccher","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Gabor","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Martin","family":"Wirsing","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lenz","family":"Belzner","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,30]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Belzner, L., Gabor, T., Wirsing, M.: Large language model assisted software engineering: prospects, challenges, and a case study. In: International Conference on Bridging the Gap between AI and Reality, pp. 355\u2013374. Springer (2023)","DOI":"10.1007\/978-3-031-46002-9_23"},{"key":"8_CR2","unstructured":"Cassano, F., et al.: Multipl-e: a scalable and extensible approach to benchmarking neural code generation (2022)"},{"key":"8_CR3","doi-asserted-by":"publisher","unstructured":"Chang, Y., et al.: A survey on evaluation of large language models. ACM Trans. Intell. Syst. Technol. 15(3), March 2024. https:\/\/doi.org\/10.1145\/3641289","DOI":"10.1145\/3641289"},{"key":"8_CR4","unstructured":"Chen, M., et al.: Evaluating large language models trained on code (2021)"},{"key":"8_CR5","unstructured":"Cheng, Y., et al.: Exploring large language model based intelligent agents: Definitions, methods, and prospects (2024)"},{"key":"8_CR6","unstructured":"Du, M., Luu, A.T., Ji, B., Ng, S.K.: Mercury: an efficiency benchmark for llm code synthesis (2024)"},{"key":"8_CR7","unstructured":"Fernando, C., Banarse, D., Michalewski, H., Osindero, S., Rockt\u00e4schel, T.: Promptbreeder: self-referential self-improvement via prompt evolution. arXiv preprint arXiv:2309.16797 (2023)"},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Gioacchini, L., et al.: Agentquest: A modular benchmark framework to measure progress and improve llm agents (2024)","DOI":"10.18653\/v1\/2024.naacl-demo.19"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Guo, T., et al.: Large language model based multi-agents: a survey of progress and challenges (2024)","DOI":"10.24963\/ijcai.2024\/890"},{"key":"8_CR10","unstructured":"Guo, Z., et al.: Evaluating large language models: a comprehensive survey (2023)"},{"key":"8_CR11","unstructured":"Hendrycks, D.,et al.: Measuring coding challenge competence with apps (2021)"},{"key":"8_CR12","unstructured":"Hong, S., et al.: MetaGPT: meta programming for a multi-agent collaborative framework (2023)"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Hou, X., et al.: Large language models for software engineering: a systematic literature review (2024)","DOI":"10.1145\/3695988"},{"key":"8_CR14","unstructured":"Huang, D., Zhang, J.M., Luck, M., Bu, Q., Qing, Y., Cui, H.: Agentcoder: multi-agent-based code generation with iterative testing and optimisation (2024)"},{"key":"8_CR15","unstructured":"Huang, D., Zhang, J.M., Qing, Y., Cui, H.: Effibench: Benchmarking the efficiency of automatically generated code (2024)"},{"key":"8_CR16","unstructured":"Jain, N., et al.: Livecodebench: holistic and contamination free evaluation of large language models for code (2024)"},{"key":"8_CR17","unstructured":"Jimenez, C.E., et al.: SWE-bench: can language models resolve real-world github issues? In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=VTF8yNQM66"},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Lavie, A., Agarwal, A.: Meteor: an automatic metric for mt evaluation with high levels of correlation with human judgments, pp. 228\u2013231. Association for Computational Linguistics, USA (2007)","DOI":"10.3115\/1626355.1626389"},{"key":"8_CR19","unstructured":"Li, B., et al.: Devbench: a comprehensive benchmark for software development (2024)"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Li, J., Li, G., Zhang, X., Dong, Y., Jin, Z.: Evocodebench: an evolving code generation benchmark aligned with real-world code repositories (2024)","DOI":"10.18653\/v1\/2024.findings-acl.214"},{"key":"8_CR21","unstructured":"Liu, X., et al.: Agentbench: evaluating llms as agents (2023)"},{"key":"8_CR22","unstructured":"Liu, Z., et al.: Agentlite: a lightweight library for building and advancing task-oriented llm agent system (2024)"},{"key":"8_CR23","unstructured":"Lozhkov, A., et al.: Starcoder 2 and the stack v2: the next generation (2024)"},{"key":"8_CR24","unstructured":"Packer, C., et al.: Memgpt: towards llms as operating systems (2024)"},{"key":"8_CR25","doi-asserted-by":"publisher","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation, pp. 311\u2013318. Association for Computational Linguistics, USA (2002). https:\/\/doi.org\/10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135"},{"key":"8_CR26","unstructured":"Qian, C., et al.: Communicative agents for software development (2023)"},{"key":"8_CR27","unstructured":"Ren, S., et al.: Codebleu: a method for automatic evaluation of code synthesis (2020)"},{"key":"8_CR28","unstructured":"Ridnik, T., Kredo, D., Friedman, I.: Code generation with alphacodium: from prompt engineering to flow engineering (2024)"},{"issue":"7995","key":"8_CR29","doi-asserted-by":"publisher","first-page":"468","DOI":"10.1038\/s41586-023-06924-6","volume":"625","author":"B Romera-Paredes","year":"2024","unstructured":"Romera-Paredes, B., et al.: Mathematical discoveries from program search with large language models. Nature 625(7995), 468\u2013475 (2024)","journal-title":"Nature"},{"key":"8_CR30","unstructured":"Sai, A.B., Mohankumar, A.K., Khapra, M.M.: A survey of evaluation metrics used for nlg systems (2020)"},{"key":"8_CR31","unstructured":"Si, C., Zhang, Y., Yang, Z., Liu, R., Yang, D.: Design2code: how far are we from automating front-end engineering? (2024)"},{"key":"8_CR32","unstructured":"Tao, W., Zhou, Y., Zhang, W., Cheng, Y.: Magis: Llm-based multi-agent framework for github issue resolution (2024)"},{"key":"8_CR33","doi-asserted-by":"publisher","unstructured":"Towers, M., et al.: Gymnasium, March 2023. https:\/\/doi.org\/10.5281\/zenodo.8127026, https:\/\/zenodo.org\/record\/8127025","DOI":"10.5281\/zenodo.8127026"},{"key":"8_CR34","doi-asserted-by":"publisher","unstructured":"Wang, L., et al.: A survey on large language model based autonomous agents. Frontiers Comput. Sci. 18(6), March 2024. https:\/\/doi.org\/10.1007\/s11704-024-40231-1","DOI":"10.1007\/s11704-024-40231-1"},{"key":"8_CR35","unstructured":"Wu, Q., et al.: Autogen: enabling next-gen llm applications via multi-agent conversation (2023)"},{"key":"8_CR36","unstructured":"Xie, Y., Xie, A., Sheth, D., Liu, P., Fried, D., Rose, C.: Codebenchgen: creating scalable execution-based code generation benchmarks (2024)"},{"key":"8_CR37","doi-asserted-by":"crossref","unstructured":"Yadav, A., Singh, M.: Pythonsaga: Redefining the benchmark to evaluate code generating llm (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.996"},{"key":"8_CR38","unstructured":"Yang, H., Yue, S., He, Y.: Auto-gpt for online decision making: Benchmarks and additional opinions (2023)"},{"key":"8_CR39","unstructured":"Yang, J., et al.: Swe-agent: agent computer interfaces enable software engineering language models (2024)"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, F., et al.: Repocoder: repository-level code completion through iterative retrieval and generation (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.151"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Zhang, K., Li, J., Li, G., Shi, X., Jin, Z.: Codeagent: enhancing code generation with tool-integrated agent systems for real-world repo-level coding challenges (2024)","DOI":"10.18653\/v1\/2024.acl-long.737"},{"key":"8_CR42","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: Evaluating text generation with bert (2020)"},{"key":"8_CR43","unstructured":"Zheng, Z., et al.: A survey of large language models for code: Evolution, benchmarking, and future trends (2024)"},{"key":"8_CR44","unstructured":"Zhuo, T.Y.: Ice-score: instructing large language models to evaluate code (2024)"}],"container-title":["Lecture Notes in Computer Science","Bridging the Gap Between AI and Reality"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-75434-0_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,29]],"date-time":"2024-12-29T10:02:31Z","timestamp":1735466551000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-75434-0_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,30]]},"ISBN":["9783031754333","9783031754340"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-75434-0_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,30]]},"assertion":[{"value":"30 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"AISoLA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Bridging the Gap between AI and Reality","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Crete","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"aisola2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2023-aisola.isola-conference.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}