{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T22:08:00Z","timestamp":1743113280889,"version":"3.40.3"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031789762"},{"type":"electronic","value":"9783031789779"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78977-9_4","type":"book-chapter","created":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T10:12:12Z","timestamp":1737972732000},"page":"52-68","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Are Large Language Models Really Bias-Free? Jailbreak Prompts for\u00a0Assessing Adversarial Robustness to\u00a0Bias Elicitation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3053-6132","authenticated-orcid":false,"given":"Riccardo","family":"Cantini","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3986-6593","authenticated-orcid":false,"given":"Giada","family":"Cosenza","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5031-1996","authenticated-orcid":false,"given":"Alessio","family":"Orsino","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1910-9236","authenticated-orcid":false,"given":"Domenico","family":"Talia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,28]]},"reference":[{"key":"4_CR1","unstructured":"Abdin, M.I., et al.: Phi-3 technical report: a highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Abid, A., Farooqi, M., Zou, J.: Persistent anti-muslim bias in large language models. In: Proceedings of AIES 2021, pp. 298\u2013306. ACM (2021)","DOI":"10.1145\/3461702.3462624"},{"key":"4_CR3","unstructured":"Anil, R., Borgeaud, S., Wu, Y., Alayrac, J., Yu, J., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"4_CR4","unstructured":"Bellagente, M., et al.: Stable LM 2 1.6B technical report. arXiv preprint arXiv:2402.17834 (2024)"},{"key":"4_CR5","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Proceedings of NeurIPS 2020 (2020)"},{"key":"4_CR6","unstructured":"Bubeck, S., et al.: Sparks of artificial general intelligence: early experiments with GPT-4. arXiv preprint arXiv:2303.12712 (2023)"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Cantini, R., Cosentino, C., Kilanioti, I., Marozzo, F., Talia, D.: Unmasking covid-19 false information on twitter: a topic-based approach with bert. In: Discovery Science, vol. 14276, pp. 126\u2013140. Springer, Cham (2023)","DOI":"10.1007\/978-3-031-45275-8_9"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Chang, Y., et al.: A survey on evaluation of large language models. ACM Trans. Intell. Syst. Technol. 15(3), 39:1\u201339:45 (2024)","DOI":"10.1145\/3641289"},{"key":"4_CR9","unstructured":"Chao, P., Robey, A., Dobriban, E., Hassani, H., Pappas, G.J., Wong, E.: Jailbreaking black box large language models in twenty queries. arXiv preprint arXiv:2310.08419 (2023)"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Dhamala, J., et al.: Bold: dataset and metrics for measuring biases in open-ended language generation. In: Proceedings of FAccT 2021, pp. 862\u2013872. ACM (2021)","DOI":"10.1145\/3442188.3445924"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Ferrara, E.: Should chatgpt be biased? Challenges and risks of bias in large language models. First Monday 28(11) (2023)","DOI":"10.5210\/fm.v28i11.13346"},{"key":"4_CR12","doi-asserted-by":"crossref","unstructured":"Gallegos, I.O., Rossi, R.A., Barrow, J., Tanjim, M.M., Kim, S., et al.: Bias and fairness in large language models: a survey. Comput. Linguist. (2024)","DOI":"10.1162\/coli_a_00524"},{"key":"4_CR13","unstructured":"Gupta, V., Venkit, P.N., Lauren\u00e7on, H., Wilson, S., Passonneau, R.J.: Calm: a multi-task benchmark for comprehensive assessment of language model bias. arXiv preprint arXiv:2308.12539 (2023)"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Hong, J., Lee, N., Thorne, J.: Reference-free monolithic preference optimization with odds ratio. arXiv preprint arXiv:2403.07691 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.626"},{"key":"4_CR15","unstructured":"Jiang, A.Q., et al.: Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"4_CR16","unstructured":"Jin, H., Chen, R., Zhou, A., Chen, J., Zhang, Y., Wang, H.: Guard: role-playing to generate natural-language jailbreakings to test guideline adherence of large language models. arXiv preprint arXiv:2402.03299 (2024)"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Koo, R., Lee, M., Raheja, V., Park, J.I., Kim, Z.M., Kang, D.: Benchmarking cognitive biases in large language models as evaluators. In: Findings of ACL 2024, pp. 517\u2013545. ACL (2024)","DOI":"10.18653\/v1\/2024.findings-acl.29"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Lapid, R., Langberg, R., Sipper, M.: Open sesame! universal black box jailbreaking of large language models. arXiv preprint arXiv:2309.01446 (2023)","DOI":"10.3390\/app14167150"},{"key":"4_CR19","unstructured":"Liu, X., Xu, N., Chen, M., Xiao, C.: Autodan: generating stealthy jailbreak prompts on aligned large language models. In: Proceedings of ICLR 2024 (2024)"},{"key":"4_CR20","unstructured":"Lum, K., Anthis, J.R., Nagpal, C., D\u2019Amour, A.: Bias in language models: beyond trick tests and toward ruted evaluation. arXiv preprint arXiv:2402.12649 (2024)"},{"key":"4_CR21","unstructured":"Manerba, M.M., Stanczak, K., Guidotti, R., Augenstein, I.: Social bias probing: fairness benchmarking for language models. arXiv preprint arXiv:2311.09090 (2023)"},{"key":"4_CR22","unstructured":"Mehrotra, A., Zampetakis, M., Kassianik, P., Nelson, B., Anderson, H.S., et\u00a0al.: Tree of attacks: jailbreaking black-box LLMs automatically. arXiv preprint arXiv:2312.02119 (2023)"},{"key":"4_CR23","unstructured":"Mesnard, T., Hardin, C., Dadashi, R., Bhupatiraju, S., Pathak, S., et\u00a0al.: Gemma: open models based on gemini research and technology. arXiv preprint arXiv:2403.08295 (2024)"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Nadeem, M., Bethke, A., Reddy, S.: StereoSet: measuring stereotypical bias in pretrained language models. In: Proceedings of ACL-IJCNLP 2021, pp. 5356\u20135371. ACL (2021)","DOI":"10.18653\/v1\/2021.acl-long.416"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Navigli, R., Conia, S., Ross, B.: Biases in large language models: origins, inventory, and discussion. ACM J. Data Inf. Qual. 15(2) (2023)","DOI":"10.1145\/3597307"},{"key":"4_CR26","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Manning, C.D., Ermon, S., Finn, C.: Direct preference optimization: your language model is secretly a reward model. In: Proceedings of NeurIPS 2023 (2023)"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Ranathunga, S., Lee, E.A., Skenduli, M.P., Shekhar, R., Alam, M., et\u00a0al.: Neural machine translation for low-resource languages: a survey. ACM Comput. Surv. 55(11), 229:1\u2013229:37 (2023)","DOI":"10.1145\/3567592"},{"key":"4_CR28","doi-asserted-by":"publisher","first-page":"1408","DOI":"10.1162\/tacl_a_00434","volume":"9","author":"T Schick","year":"2021","unstructured":"Schick, T., Udupa, S., Sch\u00fctze, H.: Self-diagnosis and self-debiasing: a proposal for reducing corpus-based bias in NLP. Trans. Assoc. Comput. Linguist. 9, 1408\u20131424 (2021)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Sheng, E., Chang, K., Natarajan, P., Peng, N.: The woman worked as a babysitter: on biases in language generation. In: Proceedings of EMNLP-IJCNLP 2019, pp. 3405\u20133410. ACL (2019)","DOI":"10.18653\/v1\/D19-1339"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Sun, T., Gaut, A., Tang, S., Huang, Y., et\u00a0al.: Mitigating gender bias in natural language processing: literature review. In: Proceedings of ACL 2019, pp. 1630\u20131640. ACL (2019)","DOI":"10.18653\/v1\/P19-1159"},{"key":"4_CR31","unstructured":"Tedeschi, S., et al.: Alert: a comprehensive benchmark for assessing large language models\u2019 safety through red teaming. arXiv preprint arXiv:2404.08676 (2024)"},{"key":"4_CR32","unstructured":"Wang, B., et al.: Adversarial glue: a multi-task benchmark for robustness evaluation of language models. In: Proceedings of NeurIPS Datasets and Benchmarks 2021 (2021)"},{"issue":"1","key":"4_CR33","first-page":"48","volume":"47","author":"J Wang","year":"2024","unstructured":"Wang, J., et al.: On the robustness of chatgpt: an adversarial and out-of-distribution perspective. IEEE Data Eng. Bull. 47(1), 48\u201362 (2024)","journal-title":"IEEE Data Eng. Bull."},{"key":"4_CR34","unstructured":"Weidinger, L., Mellor, J., Rauh, M., Griffin, C., Uesato, J., et\u00a0al.: Ethical and social risks of harm from language models. arXiv preprint arXiv:2112.04359 (2021)"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, B.H., Lemoine, B., Mitchell, M.: Mitigating unwanted biases with adversarial learning. In: Proceedings of AIES 2018, pp. 335\u2013340. ACM (2018)","DOI":"10.1145\/3278721.3278779"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Zmigrod, R., Mielke, S.J., Wallach, H., Cotterell, R.: Counterfactual data augmentation for mitigating gender stereotypes in languages with rich morphology. In: Proceedings of ACL 2019, pp. 1651\u20131661. ACL (2019)","DOI":"10.18653\/v1\/P19-1161"}],"container-title":["Lecture Notes in Computer Science","Discovery Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78977-9_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T10:12:41Z","timestamp":1737972761000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78977-9_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031789762","9783031789779"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78977-9_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"28 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Discovery Science","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pisa","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dis2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ds2024.isti.cnr.it\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}