{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:46:38Z","timestamp":1774021598981,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,11,26]],"date-time":"2023-11-26T00:00:00Z","timestamp":1700956800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,11,30]]},"DOI":"10.1145\/3605764.3623916","type":"proceedings-article","created":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T12:12:17Z","timestamp":1700568737000},"page":"221-232","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["Reward Shaping for Happier Autonomous Cyber Security Agents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5081-0052","authenticated-orcid":false,"given":"Elizabeth","family":"Bates","sequence":"first","affiliation":[{"name":"The Alan Turing Institute, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2667-5906","authenticated-orcid":false,"given":"Vasilios","family":"Mavroudis","sequence":"additional","affiliation":[{"name":"The Alan Turing Institute, London, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6340-004X","authenticated-orcid":false,"given":"Chris","family":"Hicks","sequence":"additional","affiliation":[{"name":"The Alan Turing Institute, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2023,11,26]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2021. Cyber Autonomy Gym for Experimentation Challenge 1. https:\/\/github. com\/cage-challenge\/cage-challenge-1. Created by Maxwell Standen David Bowman Son Hoang Toby Richer Martin Lucas Richard Van Tassel."},{"key":"e_1_3_2_2_2_1","unstructured":"2021. CybORG: A Gym for the Development of Autonomous Cyber Agents. arXiv."},{"key":"e_1_3_2_2_3_1","unstructured":"2022. Cyber Autonomy Gym for Experimentation Challenge 2. https:\/\/github. com\/cage-challenge\/cage-challenge-2. Created by Maxwell Standen David Bowman Son Hoang Toby Richer Martin Lucas Richard Van Tassel Phillip Vu Mitchell Kiely."},{"key":"e_1_3_2_2_4_1","unstructured":"2022. Cyber Operations Research Gym. https:\/\/github.com\/cage-challenge\/ CybORG. Created by Maxwell Standen David Bowman Son Hoang Toby Richer Martin Lucas Richard Van Tassel Phillip Vu Mitchell Kiely KC C. Natalie Konschnik Joshua Collyer."},{"key":"e_1_3_2_2_5_1","volume-title":"The P value and statistical significance: misunderstandings, explanations, challenges, and alternatives. Indian journal of psychological medicine 41, 3","author":"Andrade Chittaranjan","year":"2019","unstructured":"Chittaranjan Andrade. 2019. The P value and statistical significance: misunderstandings, explanations, challenges, and alternatives. Indian journal of psychological medicine 41, 3 (2019), 210--215."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560830.3563732"},{"key":"e_1_3_2_2_7_1","volume-title":"Unifying count-based exploration and intrinsic motivation. Advances in neural information processing systems 29","author":"Bellemare Marc","year":"2016","unstructured":"Marc Bellemare, Sriram Srinivasan, Georg Ostrovski, Tom Schaul, David Saxton, and Remi Munos. 2016. Unifying count-based exploration and intrinsic motivation. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_2_8_1","unstructured":"Greg Brockman Vicki Cheung Ludwig Pettersson Jonas Schneider John Schulman Jie Tang and Wojciech Zaremba. 2016. OpenAI Gym. arXiv:arXiv:1606.01540"},{"key":"e_1_3_2_2_9_1","volume-title":"Large-scale study of curiosity-driven learning. arXiv preprint arXiv:1808.04355","author":"Burda Yuri","year":"2018","unstructured":"Yuri Burda, Harri Edwards, Deepak Pathak, Amos Storkey, Trevor Darrell, and Alexei A Efros. 2018. Large-scale study of curiosity-driven learning. arXiv preprint arXiv:1808.04355 (2018)."},{"key":"e_1_3_2_2_10_1","unstructured":"CAGE. 2022. TTCP CAGE Challenge 2. https:\/\/github.com\/cage-challenge\/cagechallenge-2."},{"key":"e_1_3_2_2_11_1","first-page":"5829","article-title":"ExplorationGuided Reward Shaping for Reinforcement Learning under Sparse Rewards","volume":"35","author":"Devidze Rati","year":"2022","unstructured":"Rati Devidze, Parameswaran Kamalaruban, and Adish Singla. 2022. ExplorationGuided Reward Shaping for Reinforcement Learning under Sparse Rewards. Advances in Neural Information Processing Systems 35 (2022), 5829--5842.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488932.3527286"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Myles Foley Mia Wang Zoe M Chris Hicks and Vasilios Mavroudis. 2023. Inroads into Autonomous Network Defence using Explained Reinforcement Learning. arXiv:2306.09318 [cs.CR]","DOI":"10.1145\/3488932.3527286"},{"key":"e_1_3_2_2_14_1","unstructured":"TTCP CAGE Working Group. 2022. TTCP CAGE Challenge 3. https:\/\/github. com\/cage-challenge\/cage-challenge-3."},{"key":"e_1_3_2_2_15_1","unstructured":"Marek Grzes. 2017. Reward shaping in episodic reinforcement learning. (2017)."},{"key":"e_1_3_2_2_16_1","first-page":"15281","article-title":"Unpacking reward shaping: Understanding the benefits of reward engineering on sample complexity","volume":"35","author":"Gupta Abhishek","year":"2022","unstructured":"Abhishek Gupta, Aldo Pacchiano, Yuexiang Zhai, Sham Kakade, and Sergey Levine. 2022. Unpacking reward shaping: Understanding the benefits of reward engineering on sample complexity. Advances in Neural Information Processing Systems 35 (2022), 15281--15295.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_17_1","volume-title":"Reinforcement Learning with Deep Energy-Based Policies. CoRR abs\/1702.08165","author":"Haarnoja Tuomas","year":"2017","unstructured":"Tuomas Haarnoja, Haoran Tang, Pieter Abbeel, and Sergey Levine. 2017. Reinforcement Learning with Deep Energy-Based Policies. CoRR abs\/1702.08165 (2017). arXiv:1702.08165 http:\/\/arxiv.org\/abs\/1702.08165"},{"key":"e_1_3_2_2_18_1","unstructured":"Tuomas Haarnoja Aurick Zhou Pieter Abbeel and Sergey Levine. 2018. Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor. arXiv:1801.01290 [cs.LG]"},{"key":"e_1_3_2_2_19_1","unstructured":"John Hannay. 2022. Cyborg Cage 2 Solution. https:\/\/github.com\/john-cardiff\/- cyborg-cage-2."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605764.3623986"},{"key":"e_1_3_2_2_21_1","unstructured":"Ashley Hill Antonin Raffin Maximilian Ernestus Adam Gleave Anssi Kanervisto Rene Traore Prafulla Dhariwal Christopher Hesse Oleg Klimov Alex Nichol Matthias Plappert Alec Radford John Schulman Szymon Sidor and Yuhuai Wu. 2018. Stable Baselines. https:\/\/github.com\/hill-a\/stable-baselines."},{"key":"e_1_3_2_2_22_1","volume-title":"Filip De Turck, and Pieter Abbeel","author":"Houthooft Rein","year":"2016","unstructured":"Rein Houthooft, Xi Chen, Yan Duan, John Schulman, Filip De Turck, and Pieter Abbeel. 2016. Vime: Variational information maximizing exploration. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1551--6709.2011.01222.x"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913495721"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIS.2016.7550947"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11741"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Pietro Mazzaglia Ozan Catal Tim Verbelen and Bart Dhoedt. 2022. CuriosityDriven Exploration via Latent Bayesian Surprise. arXiv:2104.07495 [cs.LG]","DOI":"10.1609\/aaai.v36i7.20743"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3425780"},{"key":"e_1_3_2_2_29_1","volume-title":"Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu.","author":"Mnih Volodymyr","year":"2016","unstructured":"Volodymyr Mnih, Adri\u00e0 Puigdom\u00e8nech Badia, Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu. 2016. Asynchronous Methods for Deep Reinforcement Learning. arXiv:1602.01783 [cs.LG]"},{"key":"e_1_3_2_2_30_1","unstructured":"Volodymyr Mnih Koray Kavukcuoglu David Silver Alex Graves Ioannis Antonoglou Daan Wierstra and Martin Riedmiller. 2013. Playing Atari with Deep Reinforcement Learning. arXiv:1312.5602 [cs.LG]"},{"key":"e_1_3_2_2_31_1","volume-title":"Riedmiller","author":"Mnih Volodymyr","year":"2013","unstructured":"Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, and Martin A. Riedmiller. 2013. Playing Atari with Deep Reinforcement Learning. CoRR abs\/1312.5602 (2013). arXiv:1312.5602 http:\/\/arxiv.org\/abs\/1312.5602"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"V. Mnih K. Kavukcuoglu D. Silver A A. Rusu J. Veness M G. Bellemare A. Graves M. Riedmiller A K. Fidjeland G. Ostrovski S. Petersen C. Beattie A. Sadik I. Antonoglou H. King D. Kumaran D. Wierstra S. Legg and D. Hassabis. 2015. Human-level control through deep reinforcement learning. Nature (2015).","DOI":"10.1038\/nature14236"},{"key":"e_1_3_2_2_33_1","unstructured":"Andres Molina-Markham Cory Miniter Becky Powell and Ahmad Ridley. 2021. Network Environment Design for Autonomous Cyberdefense. arXiv:2103.07583 [cs.CR]"},{"key":"e_1_3_2_2_34_1","volume-title":"Icml","volume":"99","author":"Ng Andrew Y","year":"1999","unstructured":"Andrew Y Ng, Daishi Harada, and Stuart Russell. 1999. Policy invariance under reward transformations: Theory and application to reward shaping. In Icml, Vol. 99. Citeseer, 278--287."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Jakob Nyberg and Pontus Johnson. 2023. Training Automated Defense Strategies Using Graph-based Cyber Attack Simulations. arXiv:2304.11084 [cs.CR]","DOI":"10.14722\/wosoc.2023.23006"},{"key":"e_1_3_2_2_36_1","volume-title":"Jonathan Raiman, Tim Salimans, Jeremy Schlatter, Jonas Schneider, Szymon Sidor, Ilya Sutskever, Jie Tang, Filip Wolski, and Susan Zhang.","author":"AI","year":"2019","unstructured":"OpenAI, :, Christopher Berner, Greg Brockman, Brooke Chan, Vicki Cheung, Christy Dennison, David Farhi, Quirin Fischer, Shariq Hashme, Chris Hesse, Rafal J\u00f3zefowicz, Scott Gray, Catherine Olsson, Jakub Pachocki, Michael Petrov, Henrique P. d. O. Pinto, Jonathan Raiman, Tim Salimans, Jeremy Schlatter, Jonas Schneider, Szymon Sidor, Ilya Sutskever, Jie Tang, Filip Wolski, and Susan Zhang. 2019. Dota 2 with Large Scale Deep Reinforcement Learning. arXiv:1912.06680 [cs.LG]"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.70"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2739482.2768429"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.2352\/ISSN.2470--1173.2017"},{"key":"e_1_3_2_2_40_1","volume-title":"Galaxy: A Network Emulation Framework for Cybersecurity. In 11th USENIX Workshop on Cyber Security Experimentation and Test (CSET 18)","author":"Schoonover Kevin","year":"2018","unstructured":"Kevin Schoonover, Eric Michalak, Sean Harris, Adam Gausmann, Hannah Reinbolt, Daniel R. Tauritz, Chris Rawlings, and Aaron Scott Pope. 2018. Galaxy: A Network Emulation Framework for Cybersecurity. In 11th USENIX Workshop on Cyber Security Experimentation and Test (CSET 18). USENIX Association, Baltimore, MD. https:\/\/www.usenix.org\/conference\/cset18\/presentation\/schoonover"},{"key":"e_1_3_2_2_41_1","volume-title":"International conference on machine learning. PMLR","author":"Schulman John","year":"2015","unstructured":"John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz. 2015. Trust region policy optimization. In International conference on machine learning. PMLR, 1889--1897."},{"key":"e_1_3_2_2_42_1","unstructured":"J. Schulman F. Wolski P. Dhariwal A. Radford and O. Klimov. 2017. Proximal Policy Optimization Algorithms. In arXiv:1707.06347 [cs]."},{"key":"e_1_3_2_2_43_1","volume-title":"Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347","author":"Schulman John","year":"2017","unstructured":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. 2017. Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)."},{"key":"e_1_3_2_2_44_1","volume-title":"Year =","author":"Schwartz Jonathon","year":"2020","unstructured":"Jonathon Schwartz. [n. d.]. Network Attack Simulator. https:\/\/github.com\/ Jjschwartz\/NetworkAttackSimulator, Year = 2020."},{"key":"e_1_3_2_2_45_1","unstructured":"Jonathon Schwartz and Hanna Kurniawati. 2019. Autonomous Penetration Testing using Reinforcement Learning. arXiv:1905.05965 [cs.CR]"},{"key":"e_1_3_2_2_46_1","volume-title":"Proceedings of the 31st Annual Meeting of the Cognitive Science Society.","author":"Singh Satinder","unstructured":"Satinder Singh, R. Lewis, and A. Barto. 2009. Where Do Rewards Come From? Proceedings of the 31st Annual Meeting of the Cognitive Science Society."},{"key":"e_1_3_2_2_47_1","unstructured":"Dr. Surjit. 2020. Deep reinforcement learning using proximal policy optimization. https:\/\/medium.com\/analytics-vidhya\/deep-reinforcement-learning-usingproximal-policy-optimization-7555280ef941"},{"key":"e_1_3_2_2_48_1","volume-title":"Reinforcement learning: An introduction","author":"Sutton Richard S","unstructured":"Richard S Sutton and Andrew G Barto. 2018. Reinforcement learning: An introduction. MIT press."},{"key":"e_1_3_2_2_49_1","volume-title":"Reinforcement learning: An introduction","author":"Sutton Richard S","unstructured":"Richard S Sutton and Andrew G Barto. 2018. Reinforcement learning: An introduction. MIT press."},{"key":"e_1_3_2_2_50_1","volume-title":"Yan Duan, John Schulman, Filip DeTurck, and Pieter Abbeel.","author":"Tang Haoran","year":"2017","unstructured":"Haoran Tang, Rein Houthooft, Davis Foote, Adam Stooke, OpenAI Xi Chen, Yan Duan, John Schulman, Filip DeTurck, and Pieter Abbeel. 2017. # exploration: A study of count-based exploration for deep reinforcement learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_51_1","unstructured":"Microsoft Defender Research Team. 2021. CyberBattleSim. https:\/\/github.com\/ microsoft\/cyberbattlesim. Created by Christian Seifert Michael Betser William Blum James Bono Kate Farris Emily Goren Justin Grana Kristian Holsheimer Brandon Marken Joshua Neil Nicole Nichols Jugal Parikh Haoran Wei.."},{"key":"e_1_3_2_2_52_1","volume-title":"Keeping your distance: Solving sparse reward tasks using self-balancing shaped rewards. Advances in Neural Information Processing Systems 32","author":"Trott Alexander","year":"2019","unstructured":"Alexander Trott, Stephan Zheng, Caiming Xiong, and Richard Socher. 2019. Keeping your distance: Solving sparse reward tasks using self-balancing shaped rewards. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1190"},{"key":"e_1_3_2_2_54_1","unstructured":"Melody Wolk Andy Applebaum Camron Dennler Patrick Dwyer Marina Moskowitz Harold Nguyen Nicole Nichols Nicole Park Paul Rachwalski Frank Rau and Adrian Webster. 2022. Beyond CAGE: Investigating Generalization of Learned Autonomous Network Defense Policies. arXiv:2211.15557 [cs.LG]"},{"key":"e_1_3_2_2_55_1","unstructured":"Yizhou Yang and Xin Liu. 2022. Behaviour-Diverse Automatic Penetration Testing: A Curiosity-Driven Multi-Objective Deep Reinforcement Learning Approach. arXiv:2202.10630 [cs.LG]"},{"key":"e_1_3_2_2_56_1","first-page":"24611","article-title":"The surprising effectiveness of ppo in cooperative multi-agent games","volume":"35","author":"Yu Chao","year":"2022","unstructured":"Chao Yu, Akash Velu, Eugene Vinitsky, Jiaxuan Gao, Yu Wang, Alexandre Bayen, and Yi Wu. 2022. The surprising effectiveness of ppo in cooperative multi-agent games. Advances in Neural Information Processing Systems 35 (2022), 24611--24624.","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"CCS '23: ACM SIGSAC Conference on Computer and Communications Security","location":"Copenhagen Denmark","acronym":"CCS '23","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605764.3623916","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3605764.3623916","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:36:12Z","timestamp":1755912972000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605764.3623916"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,26]]},"references-count":56,"alternative-id":["10.1145\/3605764.3623916","10.1145\/3605764"],"URL":"https:\/\/doi.org\/10.1145\/3605764.3623916","relation":{},"subject":[],"published":{"date-parts":[[2023,11,26]]},"assertion":[{"value":"2023-11-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}