{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T16:40:51Z","timestamp":1783096851137,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":15,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T00:00:00Z","timestamp":1749600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,25]]},"DOI":"10.1145\/3713081.3731733","type":"proceedings-article","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T17:20:36Z","timestamp":1749230436000},"page":"31-35","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["ASTRAL: A Tool for the Automated Safety Testing of Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0395-5131","authenticated-orcid":false,"given":"Miriam","family":"Ugarte","sequence":"first","affiliation":[{"name":"Mondragon University, Mondragon, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0588-316X","authenticated-orcid":false,"given":"Pablo","family":"Valle","sequence":"additional","affiliation":[{"name":"Mondragon University, Mondragon, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4708-4606","authenticated-orcid":false,"given":"Jos\u00e9 Antonio","family":"Parejo","sequence":"additional","affiliation":[{"name":"University of Seville, Seville, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8816-6213","authenticated-orcid":false,"given":"Sergio","family":"Segura","sequence":"additional","affiliation":[{"name":"University of Seville, Seville, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7507-5080","authenticated-orcid":false,"given":"Aitor","family":"Arrieta","sequence":"additional","affiliation":[{"name":"Mondragon University, Mondragon, Spain"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. European Commission AI Act. https:\/\/digital-strategy.ec.europa.eu\/en\/policies\/regulatory-framework-ai. [Online]."},{"key":"e_1_3_2_1_2_1","volume-title":"Jos\u00e9 Antonio Parejo, and Sergio Segura","author":"Arrieta Aitor","year":"2025","unstructured":"Aitor Arrieta, Miriam Ugarte, Pablo Valle, Jos\u00e9 Antonio Parejo, and Sergio Segura. 2025. Early External Safety Testing of OpenAI's o3-mini: Insights from the Pre-Deployment Evaluation. arXiv:2501.17749 [cs.SE] https:\/\/arxiv.org\/abs\/2501.17749"},{"key":"e_1_3_2_1_3_1","volume-title":"Jos\u00e9 Antonio Parejo, and Sergio Segura","author":"Arrieta Aitor","year":"2025","unstructured":"Aitor Arrieta, Miriam Ugarte, Pablo Valle, Jos\u00e9 Antonio Parejo, and Sergio Segura. 2025. o3-mini vs DeepSeek-R1: Which One is Safer? arXiv:2501.18438 [cs.SE] https:\/\/arxiv.org\/abs\/2501.18438"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.55662\/JST.2023.4605"},{"key":"e_1_3_2_1_5_1","unstructured":"Mianqiu Huang Xiaoran Liu Shaojun Zhou Mozhi Zhang Chenkun Tan Pengyu Wang Qipeng Guo Zhe Xu Linyang Li Zhikai Lei et al. 2024. LongSafetyBench: Long-Context LLMs Struggle with Safety Issues. arXiv preprint arXiv:2411.06899 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Beavertails: Towards improved safety alignment of LLM via a human-preference dataset. Advances in Neural Information Processing Systems 36","author":"Ji Jiaming","year":"2024","unstructured":"Jiaming Ji, Mickel Liu, Josef Dai, Xuehai Pan, Chi Zhang, Ce Bian, Boyuan Chen, Ruiyang Sun, Yizhou Wang, and Yaodong Yang. 2024. Beavertails: Towards improved safety alignment of LLM via a human-preference dataset. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_7_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459\u20139474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_8_1","volume-title":"Salad-bench: A hierarchical and comprehensive safety benchmark for large language models. arXiv preprint arXiv:2402.05044","author":"Li Lijun","year":"2024","unstructured":"Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. 2024. Salad-bench: A hierarchical and comprehensive safety benchmark for large language models. arXiv preprint arXiv:2402.05044 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Sergio Segura, and Aitor Arrieta.","author":"Ugarte Miriam","year":"2025","unstructured":"Miriam Ugarte, Pablo Valle, Jos\u00e9 Antonio Parejo, Sergio Segura, and Aitor Arrieta. 2025. Astral: Automated safety testing of large language models. arXiv preprint arXiv:2501.17132 (2025)."},{"key":"e_1_3_2_1_10_1","volume-title":"Rebecca Qian, Anand Kannappan, Scott A Hale, and Paul R\u00f6ttger.","author":"Vidgen Bertie","year":"2023","unstructured":"Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A Hale, and Paul R\u00f6ttger. 2023. Simplesafetytests: a test suite for identifying critical safety risks in large language models. arXiv preprint arXiv:2311.08370 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Kaixuan Huang, Luxi He, Boyi Wei, Dacheng Li, Ying Sheng, et al.","author":"Xie Tinghao","year":"2024","unstructured":"Tinghao Xie, Xiangyu Qi, Yi Zeng, Yangsibo Huang, Udari Madhushani Sehwag, Kaixuan Huang, Luxi He, Boyi Wei, Dacheng Li, Ying Sheng, et al. 2024. Sorry-bench: Systematically evaluating large language model safety refusal behaviors. arXiv preprint arXiv:2406.14598 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"SafeBench: A Safety Evaluation Framework for Multimodal Large Language Models. arXiv preprint arXiv:2410.18927","author":"Ying Zonghao","year":"2024","unstructured":"Zonghao Ying, Aishan Liu, Siyuan Liang, Lei Huang, Jinyang Guo, Wenbo Zhou, Xianglong Liu, and Dacheng Tao. 2024. SafeBench: A Safety Evaluation Framework for Multimodal Large Language Models. arXiv preprint arXiv:2410.18927 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"S-Eval: Automatic and Adaptive Test Generation for Benchmarking Safety Evaluation of Large Language Models. arXiv preprint arXiv:2405.14191","author":"Yuan Xiaohan","year":"2024","unstructured":"Xiaohan Yuan, Jinfeng Li, Dongxia Wang, Yuefeng Chen, Xiaofeng Mao, Longtao Huang, Hui Xue, Wenhai Wang, Kui Ren, and Jingyi Wang. 2024. S-Eval: Automatic and Adaptive Test Generation for Benchmarking Safety Evaluation of Large Language Models. arXiv preprint arXiv:2405.14191 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"CHiSafetyBench: A Chinese Hierarchical Safety Benchmark for Large Language Models. arXiv preprint arXiv:2406.10311","author":"Zhang Wenjing","year":"2024","unstructured":"Wenjing Zhang, Xuejiao Lei, Zhaoxiang Liu, Meijuan An, Bikun Yang, KaiKai Zhao, Kai Wang, and Shiguo Lian. 2024. CHiSafetyBench: A Chinese Hierarchical Safety Benchmark for Large Language Models. arXiv preprint arXiv:2406.10311 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Safetybench: Evaluating the safety of large language models with multiple choice questions. arXiv preprint arXiv:2309.07045","author":"Zhang Zhexin","year":"2023","unstructured":"Zhexin Zhang, Leqi Lei, Lindong Wu, Rui Sun, Yongkang Huang, Chong Long, Xiao Liu, Xuanyu Lei, Jie Tang, and Minlie Huang. 2023. Safetybench: Evaluating the safety of large language models with multiple choice questions. arXiv preprint arXiv:2309.07045 (2023)."}],"event":{"name":"ISSTA Companion '25: 34th ACM SIGSOFT International Symposium on Software Testing and Analysis","location":"Clarion Hotel Trondheim Trondheim Norway","acronym":"ISSTA Companion '25","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 34th ACM SIGSOFT International Symposium on Software Testing and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713081.3731733","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:09Z","timestamp":1750295889000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713081.3731733"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,11]]},"references-count":15,"alternative-id":["10.1145\/3713081.3731733","10.1145\/3713081"],"URL":"https:\/\/doi.org\/10.1145\/3713081.3731733","relation":{},"subject":[],"published":{"date-parts":[[2025,6,11]]},"assertion":[{"value":"2025-06-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}