{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,22]],"date-time":"2025-12-22T22:14:57Z","timestamp":1766441697803,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":91,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3719027.3744855","type":"proceedings-article","created":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T23:32:38Z","timestamp":1763854358000},"page":"603-617","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Measuring and Augmenting Large Language Models for Solving Capture-the-Flag Challenges"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7014-9030","authenticated-orcid":false,"given":"Zimo","family":"Ji","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3752-0718","authenticated-orcid":false,"given":"Daoyuan","family":"Wu","sequence":"additional","affiliation":[{"name":"Lingnan University, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4646-7960","authenticated-orcid":false,"given":"Wenyuan","family":"Jiang","sequence":"additional","affiliation":[{"name":"D-INFK, ETH Zurich, Zurich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7680-2817","authenticated-orcid":false,"given":"Pingchuan","family":"Ma","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9897-4086","authenticated-orcid":false,"given":"Zongjie","family":"Li","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0866-0308","authenticated-orcid":false,"given":"Shuai","family":"Wang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,22]]},"reference":[{"volume-title":"Cybercrime To Cost The World $10.5 Trillion Annually By","year":"2025","unstructured":"2020. Cybercrime To Cost The World $10.5 Trillion Annually By 2025. https:\/\/cybersecurityventures.com\/cybercrime-will-cost-the-world-16--4-billion-a-day-in-2021\/","key":"e_1_3_2_1_1_1"},{"unstructured":"2023. AI Cyber Challenge Opens Registration Adds $4 Million in Prizes Shows Scoring Algorithm and Challenge Exemplar. https:\/\/www.darpa.mil\/news\/2023\/ai-cyber-challenge-opens","key":"e_1_3_2_1_2_1"},{"volume-title":"DEF CON\u00ae 27 Hacking Conference Contests & Events. https:\/\/defcon.org\/html\/defcon-27\/dc-27-ce.html","unstructured":"2023. DEF CON\u00ae 27 Hacking Conference Contests & Events. https:\/\/defcon.org\/html\/defcon-27\/dc-27-ce.html","key":"e_1_3_2_1_3_1"},{"unstructured":"2024. 0CTF 2024. https:\/\/ctf.0ops.sjtu.cn\/. https:\/\/ctf.0ops.sjtu.cn\/","key":"e_1_3_2_1_4_1"},{"unstructured":"2024. All about CTF. https:\/\/ctftime.org\/. https:\/\/ctftime.org\/","key":"e_1_3_2_1_5_1"},{"unstructured":"2024. Assistants API Overview. https:\/\/platform.openai.com\/docs\/assistants\/overview?context=with-streaming. https:\/\/platform.openai.com\/docs\/assistants\/overview?context=with-streaming","key":"e_1_3_2_1_6_1"},{"unstructured":"2024. BUUCTF. https:\/\/buuoj.cn\/. https:\/\/buuoj.cn\/","key":"e_1_3_2_1_7_1"},{"unstructured":"2024. Capture the Flag. https:\/\/www.csaw.io\/ctf. https:\/\/www.csaw.io\/ctf","key":"e_1_3_2_1_8_1"},{"unstructured":"2024. Capture the Flag for Empowered Cybersecurity Training. https:\/\/ine.com\/blog\/capture-the-flag-for-empowered-cybersecurity-training","key":"e_1_3_2_1_9_1"},{"unstructured":"2024. CGC: Cyber Grand Challenge. https:\/\/www.darpa.mil\/research\/programs\/cyber-grand-challenge","key":"e_1_3_2_1_10_1"},{"unstructured":"2024. Claude 3.5 Sonnet. hhttps:\/\/www.anthropic.com\/news\/claude-3--5-sonnet. https:\/\/www.anthropic.com\/news\/claude-3--5-sonnet","key":"e_1_3_2_1_11_1"},{"unstructured":"2024. DeepSeek. https:\/\/www.deepseek.com\/","key":"e_1_3_2_1_12_1"},{"unstructured":"2024. DEFCON. https:\/\/defcon.org\/. https:\/\/defcon.org\/","key":"e_1_3_2_1_13_1"},{"unstructured":"2024. Function calling. https:\/\/platform.openai.com\/docs\/guides\/function-calling. https:\/\/platform.openai.com\/docs\/guides\/function-calling","key":"e_1_3_2_1_14_1"},{"unstructured":"2024. Ghidra. https:\/\/ghidra-sre.org\/. https:\/\/ghidra-sre.org\/","key":"e_1_3_2_1_15_1"},{"unstructured":"2024. Google CTF. https:\/\/capturetheflag.withgoogle.com\/. https:\/\/capturetheflag.withgoogle.com\/","key":"e_1_3_2_1_16_1"},{"unstructured":"2024. gpt-3--5-turbo. https:\/\/platform.openai.com\/docs\/models\/gpt-3--5-turbo. https:\/\/platform.openai.com\/docs\/models\/gpt-3--5-turbo","key":"e_1_3_2_1_17_1"},{"unstructured":"2024. gpt-4. https:\/\/platform.openai.com\/docs\/models\/gpt-4-turbo-and-gpt-4. https:\/\/platform.openai.com\/docs\/models\/gpt-4-turbo-and-gpt-4","key":"e_1_3_2_1_18_1"},{"unstructured":"2024. gpt-4o. https:\/\/platform.openai.com\/docs\/models\/gpt-4o. https:\/\/platform.openai.com\/docs\/models\/gpt-4o","key":"e_1_3_2_1_19_1"},{"unstructured":"2024. HITCON. https:\/\/hitcon.org\/2024\/CMT\/. https:\/\/hitcon.org\/2024\/CMT\/","key":"e_1_3_2_1_20_1"},{"unstructured":"2024. IDA. https:\/\/hex-rays.com\/ida-pro\/","key":"e_1_3_2_1_21_1"},{"unstructured":"2024. Learning to Reason with LLMs | OpenAI. https:\/\/openai.com\/index\/learning-to-reason-with-llms\/","key":"e_1_3_2_1_22_1"},{"volume-title":"Meet Llama 3.1. https:\/\/llama.meta.com\/. https:\/\/llama.meta.com\/","unstructured":"2024. Meet Llama 3.1. https:\/\/llama.meta.com\/. https:\/\/llama.meta.com\/","key":"e_1_3_2_1_23_1"},{"unstructured":"2024. Mixtral of experts | Mistral AI | Frontier AI in your hands. https:\/\/mistral.ai\/news\/mixtral-of-experts\/. https:\/\/mistral.ai\/news\/mixtral-of-experts\/","key":"e_1_3_2_1_24_1"},{"unstructured":"2024. picoCTF - Carnegie Mellon University Cybersecurity Competition. https:\/\/picoctf.org\/. https:\/\/picoctf.org\/","key":"e_1_3_2_1_25_1"},{"unstructured":"2024. picoCTF2024. https:\/\/play.picoctf.org\/events\/73\/scoreboards. https:\/\/play.picoctf.org\/events\/73\/scoreboards","key":"e_1_3_2_1_26_1"},{"unstructured":"2024. Top 10 Cyber Hacking Competitions - Capture the Flag (CTF). https:\/\/www.geeksforgeeks.org\/top-cyber-hacking-competitions-capture-the-flag-ctf\/","key":"e_1_3_2_1_27_1"},{"volume-title":"UIUCTF 2024","year":"2024","unstructured":"2024. UIUCTF 2024. https:\/\/2024.uiuc.tf\/. https:\/\/2024.uiuc.tf\/","key":"e_1_3_2_1_28_1"},{"unstructured":"2024. VicOne & Block Harbor Spearhead Biggest Automotive Capture the Flag Competition for Cybersecurity Enthusiasts Worldwide. https:\/\/vicone.com\/company\/press-releases\/vicone-and-block-harbor-spearhead-biggest-automotive-capture-the-flag-competition-for-cybersecurity-enthusiasts-worldwide","key":"e_1_3_2_1_29_1"},{"unstructured":"2025. Burp Suite - Application Security Testing Software. https:\/\/portswigger.net\/burp","key":"e_1_3_2_1_30_1"},{"unstructured":"2025. Microsoft Security Copilot Blog. https:\/\/techcommunity.microsoft.com\/blog\/securitycopilotblog\/advancing-security-copilot-with-magic-automating-self-correction-in-nl2kql-and-b\/4390932","key":"e_1_3_2_1_31_1"},{"unstructured":"2025. Proactive Defense: The Role of Offensive Security in Cybersecurity. https:\/\/cloudsecurityalliance.org\/artifacts\/using-ai-for-offensive-security","key":"e_1_3_2_1_32_1"},{"unstructured":"2025. Using AI for Offensive Security. https:\/\/cloudsecurityalliance.org\/artifacts\/using-ai-for-offensive-security","key":"e_1_3_2_1_33_1"},{"unstructured":"2025. What is Automated Vulnerability Remediation? https:\/\/www.sentinelone.com\/cybersecurity-101\/cybersecurity\/what-is-automated-vulnerability-remediation\/","key":"e_1_3_2_1_34_1"},{"key":"e_1_3_2_1_35_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv:2303.08774 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_36_1","DOI":"10.1609\/aaai.v38i16.29720"},{"unstructured":"Manish Bhatt Sahana Chennabasappa Yue Li Cyrus Nikolaidis Daniel Song Shengye Wan Faizan Ahmad Cornelius Aschermann Yaohui Chen Dhaval Kapil et al. 2024. Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models. arXiv preprint arXiv:2404.13161 (2024).","key":"e_1_3_2_1_37_1"},{"key":"e_1_3_2_1_38_1","first-page":"6","article-title":"The cyber grand challenge and the future of cyber-autonomy","volume":"43","author":"Brumley David","year":"2018","unstructured":"David Brumley. 2018. The cyber grand challenge and the future of cyber-autonomy. USENIX Login 43, 2 (2018), 6--9.","journal-title":"USENIX Login"},{"key":"e_1_3_2_1_39_1","volume-title":"USENIX Workshop on Advances in Security Education.","author":"Burns Tanner J","year":"2017","unstructured":"Tanner J Burns, Samuel C Rios, Thomas K Jordan, Qijun Gu, and Trevor Underwood. 2017. Analysis and exercises for engaging beginners in online CTF competitions for security education. In USENIX Workshop on Advances in Security Education."},{"key":"e_1_3_2_1_40_1","volume-title":"LLM for Mobile: An Initial Roadmap. arXiv preprint arXiv:2407.06573","author":"Chen Daihang","year":"2024","unstructured":"Daihang Chen, Yonghui Liu, Mingyi Zhou, Yanjie Zhao, Haoyu Wang, Shuai Wang, Xiao Chen, Tegawend\u00e9 F Bissyand\u00e9, Jacques Klein, and Li Li. 2024. LLM for Mobile: An Initial Roadmap. arXiv preprint arXiv:2407.06573 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"et al","author":"Chen Mingyang","year":"2025","unstructured":"Mingyang Chen, Tianpeng Li, Haoze Sun, Yijie Zhou, Chenzheng Zhu, Fan Yang, Zenan Zhou, Weipeng Chen, Haofen Wang, Jeff Z Pan, et al . 2025. Learning to Reason with Search for LLMs via Reinforcement Learning. arXiv preprint arXiv:2503.19470 (2025)."},{"unstructured":"Kevin Chung and Julian Cohen. 2014. Learning obstacles in the capture the flag model. In USENIX 3GSE.","key":"e_1_3_2_1_42_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_43_1","DOI":"10.1145\/3626772.3657834"},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. USENIX Security.","author":"Deng Gelei","year":"2024","unstructured":"Gelei Deng, Yi Liu, V\u00edctor Mayoral-Vilches, Peng Liu, Yuekang Li, Yuan Xu, Tianwei Zhang, Yang Liu, Martin Pinzger, and Stefan Rass. 2024. PentestGPT: Evaluating and harnessing large language models for automated penetration testing. In Proc. USENIX Security."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1145\/3597926.3598067"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_46_1","DOI":"10.1145\/3597503.3623343"},{"key":"e_1_3_2_1_47_1","volume-title":"Teams of LLM Agents can Exploit Zero-Day Vulnerabilities. arXiv preprint arXiv:2406.01637","author":"Fang Richard","year":"2024","unstructured":"Richard Fang, Rohan Bindu, Akul Gupta, Qiusi Zhan, and Daniel Kang. 2024. Teams of LLM Agents can Exploit Zero-Day Vulnerabilities. arXiv preprint arXiv:2406.01637 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997","author":"Gao Yunfan","year":"2023","unstructured":"Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, and Haofen Wang. 2023. Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_49_1","DOI":"10.1145\/3611643.3613083"},{"key":"e_1_3_2_1_50_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_51_1","volume-title":"PenHeal: A Two-Stage LLM Framework for Automated Pentesting and Optimal Remediation. arXiv:2407.17788","author":"Huang Junjie","year":"2024","unstructured":"Junjie Huang and Quanyan Zhu. 2024. PenHeal: A Two-Stage LLM Framework for Automated Pentesting and Optimal Remediation. arXiv:2407.17788 (2024)."},{"unstructured":"Patrick Hulin Andy Davis Rahul Sridhar Andrew Fasano Cody Gallagher Aaron Sedlacek Tim Leek and Brendan Dolan-Gavitt. 2017. {AutoCTF}: Creating diverse pwnables via automated bug injection. In WOOT.","key":"e_1_3_2_1_52_1"},{"key":"e_1_3_2_1_53_1","volume-title":"SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity. arXiv:2412.20787","author":"Jing Pengfei","year":"2024","unstructured":"Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, and Xiapu Luo. 2024. SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity. arXiv:2412.20787 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Understanding the effectiveness of large language models in detecting security vulnerabilities. arXiv preprint arXiv:2311.16169","author":"Khare Avishree","year":"2023","unstructured":"Avishree Khare, Saikat Dutta, Ziyang Li, Alaia Solko-Breslin, Rajeev Alur, and Mayur Naik. 2023. Understanding the effectiveness of large language models in detecting security vulnerabilities. arXiv preprint arXiv:2311.16169 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_55_1","DOI":"10.1016\/j.cose.2022.103009"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_56_1","DOI":"10.1016\/j.jnca.2019.102470"},{"key":"e_1_3_2_1_57_1","volume-title":"Torl: Scaling tool-integrated rl. arXiv preprint arXiv:2503.23383","author":"Li Xuefeng","year":"2025","unstructured":"Xuefeng Li, Haoyang Zou, and Pengfei Liu. 2025. Torl: Scaling tool-integrated rl. arXiv preprint arXiv:2503.23383 (2025)."},{"unstructured":"Yuanchun Li Hao Wen Weijun Wang Xiangyu Li Yizhen Yuan Guohong Liu Jiacheng Liu Wenxing Xu Xiang Wang Yi Sun et al. 2024. Personal llm agents: Insights and survey about the capability efficiency and security. arXiv preprint arXiv:2401.05459 (2024).","key":"e_1_3_2_1_58_1"},{"key":"e_1_3_2_1_59_1","volume-title":"Proc. ACM\/IEEE IPSN.","author":"Li Zongjie","year":"2024","unstructured":"Zongjie Li, Wenying Qiu, Pingchuan Ma, Yichen Li, You Li, Sijia He, Baozheng Jiang, Shuai Wang, and Weixi Gu. 2024. On the Accuracy and Robustness of Large Language Models in Chinese Industrial Scenarios. In Proc. ACM\/IEEE IPSN."},{"key":"e_1_3_2_1_60_1","volume-title":"et al","author":"Li Zhong-Zhi","year":"2025","unstructured":"Zhong-Zhi Li, Duzhen Zhang, Ming-Liang Zhang, Jiaxin Zhang, Zengyan Liu, Yuxuan Yao, Haotian Xu, Junhao Zheng, Pei-Jie Wang, Xiuyi Chen, et al . 2025. From system 1 to system 2: A survey of reasoning large language models. arXiv preprint arXiv:2502.17419 (2025)."},{"key":"e_1_3_2_1_61_1","volume-title":"Foundation Agents: From Brain-Inspired Intelligence to Evolutionary, Collaborative, and Safe Systems. arXiv preprint arXiv:2504.01990","author":"Liu Bang","year":"2025","unstructured":"Bang Liu, Xinfeng Li, Jiayi Zhang, Jinlin Wang, Tanjin He, Sirui Hong, Hongzhang Liu, Shaokun Zhang, Kaitao Song, Kunlun Zhu, et al. 2025. Advances and Challenges in Foundation Agents: From Brain-Inspired Intelligence to Evolutionary, Collaborative, and Safe Systems. arXiv preprint arXiv:2504.01990 (2025)."},{"key":"e_1_3_2_1_62_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_63_1","DOI":"10.14722\/ndss.2025.241357"},{"unstructured":"Pingchuan Ma Rui Ding Shuai Wang Shi Han and Dongmei Zhang. 2023. InsightPilot: An LLM-empowered automated data exploration system. In EMNLP: System Demonstrations.","key":"e_1_3_2_1_64_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_65_1","DOI":"10.1109\/ICSE55347.2025.00027"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_66_1","DOI":"10.14722\/ndss.2024.24556"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_67_1","DOI":"10.1109\/SP46215.2023.10179420"},{"key":"e_1_3_2_1_68_1","volume-title":"An empirical evaluation of llms for solving offensive security challenges. arXiv:2402.11814","author":"Shao Minghao","year":"2024","unstructured":"Minghao Shao, Boyuan Chen, Sofija Jancheska, Brendan Dolan-Gavitt, Siddharth Garg, Ramesh Karri, and Muhammad Shafique. 2024. An empirical evaluation of llms for solving offensive security challenges. arXiv:2402.11814 (2024)."},{"key":"e_1_3_2_1_69_1","volume-title":"Meet Udeshi, Brendan Dolan-Gavitt, Haoran Xi, Kimberly Milner, Boyuan Chen, Max Yin, Siddharth Garg, Prashanth Krishnamurthy, et al.","author":"Shao Minghao","year":"2024","unstructured":"Minghao Shao, Sofija Jancheska, Meet Udeshi, Brendan Dolan-Gavitt, Haoran Xi, Kimberly Milner, Boyuan Chen, Max Yin, Siddharth Garg, Prashanth Krishnamurthy, et al. 2024. NYU CTF Dataset: A Scalable Open-Source Benchmark Dataset for Evaluating LLMs in Offensive Security. arXiv:2406.05590 (2024)."},{"key":"e_1_3_2_1_70_1","volume-title":"Proceedings of The International MultiConference of Engineers and Computer Scientists.","author":"Soares Teotino Gomes","year":"2021","unstructured":"Teotino Gomes Soares, Azhari Azhari, Nur Rokhman, and E Wonarko. 2021. Education question answering systems: a survey. In Proceedings of The International MultiConference of Engineers and Computer Scientists."},{"key":"e_1_3_2_1_71_1","volume-title":"Thunder CTF: Learning Cloud Security on a Dime. arXiv preprint arXiv:2107.12566","author":"Springer Nicholas","year":"2021","unstructured":"Nicholas Springer and Wu-chang Feng. 2021. Thunder CTF: Learning Cloud Security on a Dime. arXiv preprint arXiv:2107.12566 (2021)."},{"key":"e_1_3_2_1_72_1","volume-title":"LLM4Vuln: A Unified Evaluation Framework for Decoupling and Enhancing LLMs' Vulnerability Reasoning. arXiv:2401.16185","author":"Sun Yuqiang","year":"2024","unstructured":"Yuqiang Sun, Daoyuan Wu, Yue Xue, Han Liu, Wei Ma, Lyuye Zhang, Miaolei Shi, and Yang Liu. 2024. LLM4Vuln: A Unified Evaluation Framework for Decoupling and Enhancing LLMs' Vulnerability Reasoning. arXiv:2401.16185 (2024)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_73_1","DOI":"10.1145\/3597503.3639117"},{"key":"e_1_3_2_1_74_1","volume-title":"Choon Meng Seah, and Ee-Chien Chang.","author":"Tann Wesley","year":"2023","unstructured":"Wesley Tann, Yuancheng Liu, Jun Heng Sim, Choon Meng Seah, and Ee-Chien Chang. 2023. Using large language models for cybersecurity capture-the-flag challenges and certification questions. arXiv preprint arXiv:2308.10443 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_75_1","DOI":"10.1145\/3564625.3567985"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_76_1","DOI":"10.1109\/CIEES62939.2024.10811132"},{"key":"e_1_3_2_1_77_1","volume-title":"Ridhi Jain, and Merouane Debbah.","author":"Tihanyi Norbert","year":"2024","unstructured":"Norbert Tihanyi, Mohamed Amine Ferrag, Ridhi Jain, and Merouane Debbah. 2024. Cybermetric: A benchmark dataset for evaluating large language models knowledge in cybersecurity. arXiv preprint arXiv:2402.07688 (2024)."},{"key":"e_1_3_2_1_78_1","volume-title":"Can large language models identify and reason about security vulnerabilities? not yet. arXiv preprint arXiv:2312.12575","author":"Ullah Saad","year":"2023","unstructured":"Saad Ullah, Mingji Han, Saurabh Pujar, Hammond Pearce, Ayse Coskun, and Gianluca Stringhini. 2023. Can large language models identify and reason about security vulnerabilities? not yet. arXiv preprint arXiv:2312.12575 (2023)."},{"unstructured":"U.S. Department of Health and Human Services. 2018. Federal Policy for the Protection of Human Subjects ('Common Rule'). https:\/\/www.ecfr.gov\/current\/title-45\/subtitle-A\/subchapter-A\/part-46#p-46.104(d)(2) Title 45 Code of Federal Regulations Part 46.104(d)(2).","key":"e_1_3_2_1_79_1"},{"unstructured":"Jan Vykopal Valdemar \u0160v\u00e1bensk","key":"e_1_3_2_1_80_1"},{"key":"e_1_3_2_1_81_1","volume-title":"Proceedings of the 51st ACM Technical symposium on computer science education. 752--758","author":"Chang Ee-Chien","year":"2020","unstructured":"y, and Ee-Chien Chang. 2020. Benefits and pitfalls of using capture the flag games in university courses. In Proceedings of the 51st ACM Technical symposium on computer science education. 752--758."},{"key":"e_1_3_2_1_82_1","volume-title":"Benchmarking Multi-Modal LLMs for Testing Visual Deep Learning Systems Through the Lens of Image Mutation. arXiv preprint arXiv:2404.13945","author":"Wang Liwen","year":"2024","unstructured":"Liwen Wang, Yuanyuan Yuan, Ao Sun, Zongjie Li, Pingchuan Ma, Daoyuan Wu, and Shuai Wang. 2024. Benchmarking Multi-Modal LLMs for Testing Visual Deep Learning Systems Through the Lens of Image Mutation. arXiv preprint arXiv:2404.13945 (2024)."},{"key":"e_1_3_2_1_83_1","volume-title":"2018 USENIX Workshop on Advances in Security Education.","author":"Wi SeongIl","year":"2018","unstructured":"SeongIl Wi, Jaeseung Choi, and Sang Kil Cha. 2018. Git-based {CTF}: A Simple and Effective Approach to Organizing {In-Course} {Attack-and-Defense} Security Competition. In 2018 USENIX Workshop on Advances in Security Education."},{"key":"e_1_3_2_1_84_1","volume-title":"Autopwn: Artifact-assisted heap exploit generation for ctf pwn competitions","author":"Xu Dandan","year":"2023","unstructured":"Dandan Xu, Kai Chen, Miaoqian Lin, Chaoyang Lin, and Xiaofeng Wang. 2023. Autopwn: Artifact-assisted heap exploit generation for ctf pwn competitions. IEEE Transactions on Information Forensics and Security (2023)."},{"key":"e_1_3_2_1_85_1","volume-title":"Intercode: Standardizing and benchmarking interactive coding with execution feedback. Advances in Neural Information Processing Systems 36","author":"Yang John","year":"2024","unstructured":"John Yang, Akshara Prabhakar, Karthik Narasimhan, and Shunyu Yao. 2024. Intercode: Standardizing and benchmarking interactive coding with execution feedback. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_86_1","volume-title":"Multi-Agent Security Workshop@ NeurIPS'23","author":"Yang John","year":"2023","unstructured":"John Yang, Akshara Prabhakar, Shunyu Yao, Kexin Pei, and Karthik R Narasimhan. 2023. Language agents as hackers: Evaluating cybersecurity skills with capture the flag. In Multi-Agent Security Workshop@ NeurIPS'23."},{"key":"e_1_3_2_1_87_1","volume-title":"Tree of thoughts: Deliberate problem solving with large language models. NeurIPS","author":"Yao Shunyu","year":"2024","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2024. Tree of thoughts: Deliberate problem solving with large language models. NeurIPS (2024)."},{"key":"e_1_3_2_1_88_1","volume-title":"React: Synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2022. React: Synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629 (2022)."},{"key":"e_1_3_2_1_89_1","volume-title":"Chain-of-note: Enhancing robustness in retrieval-augmented language models. arXiv preprint arXiv:2311.09210","author":"Yu Wenhao","year":"2023","unstructured":"Wenhao Yu, Hongming Zhang, Xiaoman Pan, Kaixin Ma, Hongwei Wang, and Dong Yu. 2023. Chain-of-note: Enhancing robustness in retrieval-augmented language models. arXiv preprint arXiv:2311.09210 (2023)."},{"key":"e_1_3_2_1_90_1","volume-title":"Acfix: Guiding llms with mined common rbac practices for context-aware repair of access control vulnerabilities in smart contracts. arXiv preprint arXiv:2403.06838","author":"Zhang Lyuye","year":"2024","unstructured":"Lyuye Zhang, Kaixuan Li, Kairan Sun, Daoyuan Wu, Ye Liu, Haoye Tian, and Yang Liu. 2024. Acfix: Guiding llms with mined common rbac practices for context-aware repair of access control vulnerabilities in smart contracts. arXiv preprint arXiv:2403.06838 (2024)."},{"key":"e_1_3_2_1_91_1","volume-title":"Ldb: A large language model debugger via verifying runtime execution step-by-step. arXiv preprint arXiv:2402.16906","author":"Zhong Li","year":"2024","unstructured":"Li Zhong, Zilong Wang, and Jingbo Shang. 2024. Ldb: A large language model debugger via verifying runtime execution step-by-step. arXiv preprint arXiv:2402.16906 (2024)."}],"event":{"sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"],"acronym":"CCS '25","name":"CCS '25: ACM SIGSAC Conference on Computer and Communications Security","location":"Taipei Taiwan"},"container-title":["Proceedings of the 2025 ACM SIGSAC Conference on Computer and Communications Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3719027.3744855","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,22]],"date-time":"2025-12-22T22:10:12Z","timestamp":1766441412000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3719027.3744855"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":91,"alternative-id":["10.1145\/3719027.3744855","10.1145\/3719027"],"URL":"https:\/\/doi.org\/10.1145\/3719027.3744855","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2025-11-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}