{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T13:54:15Z","timestamp":1774360455129,"version":"3.50.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032212993","type":"print"},{"value":"9783032213006","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-21300-6_23","type":"book-chapter","created":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T12:59:30Z","timestamp":1774357170000},"page":"328-337","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Large Language Models as\u00a0Domain-Specific Retrieval Agents: A Study on\u00a0Cybersecurity Challenge Benchmarks"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8989-9514","authenticated-orcid":false,"given":"Omed","family":"Abed","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7320-6379","authenticated-orcid":false,"given":"Md. Samiul","family":"Haque","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1780-8501","authenticated-orcid":false,"given":"Patrick-Benjamin","family":"B\u00f6k","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9127-969X","authenticated-orcid":false,"given":"Matteo","family":"Gro\u00dfe-Kampmann","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,25]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Greshake, K., Abdelnabi, S., Mishra, S., Endres, C., Holz, T., Fritz, M.: Not what you\u2019ve signed up for: compromising real-world LLM-integrated applications with indirect prompt injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp.\u00a079\u201390 (2023)","DOI":"10.1145\/3605764.3623985"},{"key":"23_CR2","unstructured":"Chilton, J.: The new risks ChatGPT poses to cybersecurity. Harv. Bus. Rev.\u00a021 (2023)"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Zou, Y., Hong, Y., Xu, J., Liu, L., Fan, W.: Leveraging large language models for challenge solving in capture-the-flag. In: 2024 IEEE 23rd International Conference on Trust, Security and Privacy in Computing and Communications (TrustCom), pp.\u00a01541\u20131550. IEEE (2024)","DOI":"10.1109\/TrustCom63139.2024.00213"},{"key":"23_CR4","unstructured":"Shao, M., et al.: An empirical evaluation of LLMs for solving offensive security challenges. arXiv preprint arXiv:2402.11814 (2024)"},{"issue":"2","key":"23_CR5","doi-asserted-by":"publisher","first-page":"1186","DOI":"10.1109\/TDSC.2022.3151148","volume":"20","author":"N Sun","year":"2022","unstructured":"Sun, N., Zhang, J., Gao, S., Zhang, L.Y., Camtepe, S., Xiang, Y.: Cyber information retrieval through pragmatics understanding and visualization. IEEE Trans. Depend. Secure Comput. 20(2), 1186\u20131199 (2022)","journal-title":"IEEE Trans. Depend. Secure Comput."},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Engebretson, P., Kennedy, D.: The basics of hacking and penetration testing: ethical hacking and penetration testing made easy. In: Syngress (2013)","DOI":"10.1016\/B978-0-12-411644-3.00001-7"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Weidman, G.: Penetration Testing: A Hands-On Introduction to Hacking, 1st edn. No Starch Press, San Francisco, California (2014)","DOI":"10.1201\/b17225-2"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Ahmad, T., Butkovic, M., Truscan, D.: Using reinforcement learning for security testing: a systematic mapping study. In: 2025 IEEE International Conference on Software Testing, Verification and Validation Workshops (ICSTW), pp.\u00a0208\u2013216 (2025)","DOI":"10.1109\/ICSTW64639.2025.10962455"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Hu, Z., Beuran, R., Tan, Y.: Automated penetration testing using deep reinforcement learning. In: 2020 IEEE European Symposium on Security and Privacy Workshops (EuroS&PW), pp.\u00a02\u201310 (2020)","DOI":"10.1109\/EuroSPW51379.2020.00010"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Ghanem, M.C., Chen, T.M.: Reinforcement learning for intelligent penetration testing. In: 2018 Second World Conference on Smart Trends in Systems, Security and Sustainability (WorldS4), pp.\u00a0185\u2013192 (2018)","DOI":"10.1109\/WorldS4.2018.8611595"},{"key":"23_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.cose.2020.102154","volume":"102","author":"V Svabensk\u00fd","year":"2020","unstructured":"Svabensk\u00fd, V., \u010celeda, P., Vykopal, J., Bri\u0161akov\u00e1, S.: Cybersecurity knowledge and skills taught in capture the flag challenges. Comput. Secur. 102, 102154 (2020)","journal-title":"Comput. Secur."},{"key":"23_CR12","unstructured":"Chen, C.K., Shieh, S.W.: CTF as education for practical offensive security skill. IEEE Reliab. Soc. Newsl. (2015)"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"Wylie, P.L., Crawley, K.: The Pentester Blueprint: Starting a Career as an Ethical Hacker. Wiley, New York (2020)","DOI":"10.1002\/9781119684367"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Zou, Y., Hong, Y., Xu, J., Liu, L., Fan, W.: Leveraging large language models for challenge solving in capture-the-flag. In: 2024 IEEE 23rd International Conference on Trust, Security and Privacy in Computing and Communications (TrustCom), pp. 1541\u20131550 (2024)","DOI":"10.1109\/TrustCom63139.2024.00213"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Nelson, C., Shoshitaishvili, Y.: PWN the learning curve: education-first CTF challenges. In: Proceedings of the 55th ACM Technical Symposium on Computer Science Education V. 1, SIGCSE 2024, (New York, NY, USA), pp.\u00a0937\u2013943. Association for Computing Machinery (2024)","DOI":"10.1145\/3626252.3630912"},{"key":"23_CR16","unstructured":"NYU Tandon School of Engineering: Cybersecurity Awareness Week (CSAW) (2024). https:\/\/www.csaw.io\/. Accessed 06 Apr 2025"},{"key":"23_CR17","unstructured":"Hack The Box (2025). https:\/\/www.hackthebox.com\/. Accessed 27 Apr 2025"},{"key":"23_CR18","unstructured":"HKCERT: HKCERT Capture the Flag Challenge 2024 (2024). https:\/\/www.hkcert.org\/event\/capture-the-flag-challenge-2024. Accessed 27 Apr 2025"},{"key":"23_CR19","unstructured":"Vaswani, A., et al.: Attention is all you need. arXiv preprint arXiv:1706.03762 (2017)"},{"key":"23_CR20","unstructured":"Minaee, S., et al.: Large language models: a survey. arXiv preprint"},{"key":"23_CR21","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Ma, Z., Chen, A.R., Kim, D.J., Chen, T.-H., Wang, S.: Llmparser: an exploratory study on using large language models for log parsing. In: Proceedings of the IEEE\/ACM 46th International Conference on Software Engineering, pp.\u00a01\u201313 (2024)","DOI":"10.1145\/3597503.3639150"},{"key":"23_CR23","unstructured":"Fang, R., Bindu, R., Gupta, A., Kang, D.: LLM agents can autonomously exploit one-day vulnerabilities. arXiv preprint arXiv:2404.08144 (2024)"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Weinz, M., Zannone, N., Allodi, L., Apruzzese, G.: The impact of emerging phishing threats: assessing quishing and LLM-generated phishing emails against organizations. In: Proceedings of the 20th ACM Asia Conference on Computer and Communications Security, pp.\u00a01550\u20131566 (2025)","DOI":"10.1145\/3708821.3736195"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Pa\u00a0Pa, Y.M., Tanizaki, S., Kou, T., Van\u00a0Eeten, M., Yoshioka, K., Matsumoto, T.: An attacker\u2019s dream? exploring the capabilities of ChatGPT for developing malware. In: Proceedings of the 16th Cyber Security Experimentation and Test Workshop, pp.\u00a010\u201318 (2023)","DOI":"10.1145\/3607505.3607513"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Balayn, A., Yurrita, M., Rancourt, F., Casati, F., Gadiraju, U.: Unpacking trust dynamics in the LLM supply chain: an empirical exploration to foster trustworthy LLM production & use. In: Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems, pp.\u00a01\u201320 (2025)","DOI":"10.1145\/3706598.3713787"},{"key":"23_CR27","unstructured":"Shao, M., et al.: NYU CTF bench: a scalable open-source benchmark dataset for evaluating LLMs in offensive security (2025)"},{"key":"23_CR28","unstructured":"Yang, J., Prabhakar, A., Yao, S., Pei, K., Narasimhan, K.R.: Language agents as hackers: evaluating cybersecurity skills with capture the . In: Multi-agent Security Workshop@ NeurIPS\u201923 (2023)"},{"key":"23_CR29","unstructured":"Zhang, A.K., et\u00a0al.: Cybench: a framework for evaluating cybersecurity capabilities and risks of language models. arXiv preprint arXiv:2408.08926 (2024)"},{"issue":"2","key":"23_CR30","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1007\/BF02295996","volume":"12","author":"Q McNemar","year":"1947","unstructured":"McNemar, Q.: Note on the sampling error of the difference between correlated proportions or percentages. Psychometrika 12(2), 153\u2013157 (1947)","journal-title":"Psychometrika"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Casey, E., Chamberlain, D.: Capture the flag with ChatGPT: Security testing with AI chatbots. In: 19th International Conference on Cyber Warfare and Security: ICCWS, vol.\u00a02024 (2024)","DOI":"10.34190\/iccws.19.1.2171"},{"key":"23_CR33","unstructured":"Abramovich, T., et\u00a0al.: Enigma: enhanced interactive generative model agent for CTF challenges. arXiv preprint arXiv:2409.16165 (2024)"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Thaqi, A., Musa, A., Rexha, B.: Leveraging AI for CTF challenge optimization. In: 2024 5th International Conference on Communications, Information, Electronic and Energy Systems (CIEES), pp.\u00a01\u20135. IEEE (2024)","DOI":"10.1109\/CIEES62939.2024.10811132"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-21300-6_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T12:59:46Z","timestamp":1774357186000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-21300-6_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032212993","9783032213006"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-21300-6_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"25 March 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.Disclosure of Interest","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Delft","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 March 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 April 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"48","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2026.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}