{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T06:28:56Z","timestamp":1768112936136,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819544448","type":"print"},{"value":"9789819544455","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-4445-5_34","type":"book-chapter","created":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T03:44:23Z","timestamp":1768103063000},"page":"500-514","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluate LLMs on\u00a0Code Review Understanding: A Multi-step Reasoning Benchmark"],"prefix":"10.1007","author":[{"given":"Xue","family":"Wei","sequence":"first","affiliation":[]},{"given":"Junchang","family":"Xin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,12]]},"reference":[{"key":"34_CR1","unstructured":"Austin, J., et al.: Program synthesis with large language models (2021)"},{"key":"34_CR2","doi-asserted-by":"crossref","unstructured":"Bacchelli, A., Bird, C.: Expectations, outcomes, and challenges of modern code review. In: 2013 35th International Conference on Software Engineering (ICSE), pp. 712\u2013721. IEEE (2013)","DOI":"10.1109\/ICSE.2013.6606617"},{"issue":"4","key":"34_CR3","first-page":"94","volume":"27","author":"S Baltes","year":"2022","unstructured":"Baltes, S., Ralph, P.: Sampling in software engineering research: a critical review and guidelines. EMSE 27(4), 94 (2022)","journal-title":"EMSE"},{"key":"34_CR4","doi-asserted-by":"crossref","unstructured":"Bosu, A., Greiler, M., Bird, C.: Characteristics of useful code reviews: an empirical study at microsoft. In: 2015 IEEE\/ACM 12th Working Conference on Mining Software Repositories, pp. 146\u2013156. IEEE (2015)","DOI":"10.1109\/MSR.2015.21"},{"key":"34_CR5","unstructured":"Carlini, N., Ippolito, D., Jagielski, M., Lee, K., Tramer, F., Zhang, C.: Quantifying memorization across neural language models. arXiv preprint arXiv:2202.07646 (2023)"},{"key":"34_CR6","unstructured":"Carlini, N., et al.: Extracting training data from large language models. In: 30th USENIX Security Symposium (USENIX Security 2021), pp. 2633\u20132650 (2021)"},{"key":"34_CR7","unstructured":"Chen, M., et al.: Evaluating large language models trained on code (2021)"},{"key":"34_CR8","doi-asserted-by":"crossref","unstructured":"Efstathiou, V., Spinellis, D.: Code review comments: language matters. In: Proceedings of the 40th International Conference on Software Engineering: New Ideas and Emerging Results, pp. 69\u201372 (2018)","DOI":"10.1145\/3183399.3183411"},{"key":"34_CR9","doi-asserted-by":"crossref","unstructured":"Evtikhiev, M., Bogomolov, E., Sokolov, Y., Bryksin, T.: Out of the bleu: how should we assess quality of the code generation models? In: Journal of Systems and Software, vol.\u00a0203, p. 111741 (2023)","DOI":"10.1016\/j.jss.2023.111741"},{"key":"34_CR10","doi-asserted-by":"crossref","unstructured":"Guo, Q., et al.: Exploring the potential of chatgpt in automated code refinement: an empirical study. In: ICSE, pp. 1\u201313 (2024)","DOI":"10.1145\/3597503.3623306"},{"key":"34_CR11","unstructured":"Hendrycks, D., et al.: Measuring massive multitask language understanding. In: ICLR (2021)"},{"key":"34_CR12","doi-asserted-by":"crossref","unstructured":"Kwon, W., et al.: Efficient memory management for large language model serving with pagedattention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles (2023)","DOI":"10.1145\/3600006.3613165"},{"key":"34_CR13","doi-asserted-by":"crossref","unstructured":"Lee, K., et al.: Deduplicating training data makes language models better. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, pp. 8424\u20138445 (2022)","DOI":"10.18653\/v1\/2022.acl-long.577"},{"key":"34_CR14","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Automating code review activities by large-scale pre-training. In: ESEC\/FSE, pp. 1035\u20131047 (2022)","DOI":"10.1145\/3540250.3549081"},{"key":"34_CR15","doi-asserted-by":"crossref","unstructured":"Magar, I., Schwartz, R.: Data contamination: from memorization to exploitation. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, pp. 157\u2013165 (2022)","DOI":"10.18653\/v1\/2022.acl-short.18"},{"issue":"5","key":"34_CR16","doi-asserted-by":"publisher","first-page":"2146","DOI":"10.1007\/s10664-015-9381-9","volume":"21","author":"S McIntosh","year":"2016","unstructured":"McIntosh, S., Kamei, Y., Adams, B., Hassan, A.E.: An empirical study of the impact of modern code review practices on software quality. Empir. Softw. Eng. 21(5), 2146\u20132189 (2016)","journal-title":"Empir. Softw. Eng."},{"key":"34_CR17","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"34_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.infsof.2024.107523","volume":"175","author":"C Pornprasit","year":"2024","unstructured":"Pornprasit, C., Tantithamthavorn, C.: Fine-tuning and prompt engineering for large language models-based code review automation. Inf. Softw. Technol. 175, 107523 (2024)","journal-title":"Inf. Softw. Technol."},{"key":"34_CR19","unstructured":"Ren, S., et al.: Codebleu: a method for automatic evaluation of code synthesis. arXiv preprint arXiv:2009.10297 (2020)"},{"key":"34_CR20","doi-asserted-by":"crossref","unstructured":"Rigby, P.C., Bird, C.: Convergent contemporary software peer review practices. In: Proceedings of the 2013 9th Joint Meeting on Foundations of Software Engineering, pp. 202\u2013212 (2013)","DOI":"10.1145\/2491411.2491444"},{"key":"34_CR21","unstructured":"Robinson, J., Wingate, D.: Leveraging large language models for multiple choice question answering. In: ICLR (2023)"},{"key":"34_CR22","doi-asserted-by":"crossref","unstructured":"Roy, D., Fakhoury, S., Arnaoudova, V.: Reassessing automatic evaluation metrics for code summarization tasks. In: Proceedings of the 29th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering, pp. 1105\u20131116 (2021)","DOI":"10.1145\/3468264.3468588"},{"key":"34_CR23","doi-asserted-by":"crossref","unstructured":"Sainz, O., Campos, J.A., Garc\u00eda-Ferrero, I., Etxaniz, J., Lacalle, O.L.D., Agirre, E.: NLP evaluation in trouble: on the need to measure LLM data contamination for each benchmark. arXiv preprint arXiv:2310.18018 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.722"},{"key":"34_CR24","doi-asserted-by":"crossref","unstructured":"Sallou, J., Durieux, T., Panichella, A.: Breaking the silence: the threats of using LLMs in software engineering. In: Proceedings of the 2024 ACM\/IEEE 44th International Conference on Software Engineering: New Ideas and Emerging Results, pp. 102\u2013106 (2024)","DOI":"10.1145\/3639476.3639764"},{"key":"34_CR25","doi-asserted-by":"crossref","unstructured":"Thongtanunam, P., Pornprasit, C., Tantithamthavorn, C.: Autotransform: automated code transformation to support modern code review process. In: Proceedings of the 44th International Conference on Software Engineering, pp. 237\u2013248 (2022)","DOI":"10.1145\/3510003.3510067"},{"key":"34_CR26","doi-asserted-by":"crossref","unstructured":"Tufano, M., Pantiuchina, J., Watson, C., Bavota, G., Poshyvanyk, D.: On learning meaningful code changes via neural machine translation. In: Proceedings of the 41st International Conference on Software Engineering, pp. 25\u201336 (2019)","DOI":"10.1109\/ICSE.2019.00021"},{"issue":"2","key":"34_CR27","first-page":"338","volume":"50","author":"R Tufano","year":"2024","unstructured":"Tufano, R., Dabi\u0107, O., Mastropaolo, A., Ciniselli, M., Bavota, G.: Code review automation: strengths and weaknesses of the state of the art. TSE 50(2), 338\u2013353 (2024)","journal-title":"TSE"},{"key":"34_CR28","doi-asserted-by":"crossref","unstructured":"Tufano, R., Masiero, S., Mastropaolo, A., Pascarella, L., Poshyvanyk, D., Bavota, G.: Using pre-trained models to boost code review automation. In: 44th IEEE\/ACM 44th International Conference on Software Engineering, pp. 2291\u20132302 (2022)","DOI":"10.1145\/3510003.3510621"},{"key":"34_CR29","doi-asserted-by":"crossref","unstructured":"Tufano, R., Pascarella, L., Tufano, M., Poshyvanyk, D., Bavota, G.: Towards automating code review activities. In: 43rd IEEE\/ACM International Conference on Software Engineering, pp. 163\u2013174 (2021)","DOI":"10.1109\/ICSE43902.2021.00027"},{"key":"34_CR30","doi-asserted-by":"crossref","unstructured":"Yang, L., Xu, J., Zhang, Y., Zhang, H., Bacchelli, A.: Evacrc: evaluating code review comments. In: Proceedings of the 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering, pp. 275\u2013287 (2023)","DOI":"10.1145\/3611643.3616245"},{"key":"34_CR31","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: evaluating text generation with BERT. arXiv preprint arXiv:1904.09675 (2019)"},{"key":"34_CR32","doi-asserted-by":"crossref","unstructured":"Zhu, W., et al.: Cleaneval: clean evaluation on contaminated large language models. In: Findings of the Association for Computational Linguistics: NAACL 2024, pp. 835\u2013847 (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.53"},{"key":"34_CR33","unstructured":"Zhuo, T.Y., et al.: Bigcodebench: benchmarking code generation with diverse function calls and complex instructions. arXiv preprint arXiv:2406.15877 (2024)"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-4445-5_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T03:44:28Z","timestamp":1768103068000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-4445-5_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819544448","9789819544455"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-4445-5_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"12 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Okinawa","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"32","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2025.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}