{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,24]],"date-time":"2025-06-24T04:04:29Z","timestamp":1750737869957,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":55,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819665877","type":"print"},{"value":"9789819665884","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-6588-4_25","type":"book-chapter","created":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T14:40:54Z","timestamp":1750689654000},"page":"361-374","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating the Effects of Prompt Perturbation on Bias and Hallucination in Large Language Models"],"prefix":"10.1007","author":[{"given":"Mamehgol","family":"Yousefi","sequence":"first","affiliation":[]},{"given":"Ahmad","family":"Shahi","sequence":"additional","affiliation":[]},{"given":"Mos","family":"Sharifi","sequence":"additional","affiliation":[]},{"given":"Alvaro","family":"Romera","sequence":"additional","affiliation":[]},{"given":"Simon","family":"Hoermann","sequence":"additional","affiliation":[]},{"given":"Tham","family":"Piumsomboon","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,24]]},"reference":[{"key":"25_CR1","unstructured":"Achiam, J., et al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"25_CR2","unstructured":"Iyer, A., Kenthapadi, K.: Introducing fiddler auditor: evaluate the robustness of LLMs and NLP models (2023). https:\/\/www.fiddler.ai\/blog\/introducing-fiddler-auditor-evaluate-the-robustness-of-llms-and-nlp-models. Accessed 28 June 2023"},{"key":"25_CR3","unstructured":"Anil, R., et al.: Palm 2 technical report. arXiv preprint arXiv:2305.10403 (2023)"},{"key":"25_CR4","unstructured":"Anthropic: Claude 3 haiku: Our fastest model yet (2024). https:\/\/www.anthropic.com\/news\/claude-3-haiku"},{"key":"25_CR5","doi-asserted-by":"publisher","first-page":"587","DOI":"10.1162\/tacl_a_00041","volume":"6","author":"EM Bender","year":"2018","unstructured":"Bender, E.M., Friedman, B.: Data statements for natural language processing: toward mitigating system bias and enabling better science. Trans. Assoc. Comput. Linguist. 6, 587\u2013604 (2018)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Bender, E.M., Gebru, T., McMillan-Major, A., Shmitchell, S.: On the dangers of stochastic parrots: can language models be too big? In: FAccT (2021)","DOI":"10.1145\/3442188.3445922"},{"key":"25_CR7","doi-asserted-by":"crossref","unstructured":"Bender, E.M., Koller, A.: Climbing towards NLU: on meaning, form, and understanding in the age of data. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 5185\u20135198 (2020)","DOI":"10.18653\/v1\/2020.acl-main.463"},{"key":"25_CR8","unstructured":"Bolukbasi, T., Chang, K.W., Zou, J., Saligrama, V., Kalai, A.: Quantifying and reducing stereotypes in word embeddings. arXiv preprint arXiv:1606.06121 (2016)"},{"key":"25_CR9","unstructured":"Borgese, M., Joyce, C., Anderson, E.E., Churpek, M.M., Afshar, M.: Bias assessment and correction in machine learning algorithms: a use-case in a natural language processing algorithm to identify hospitalized patients with unhealthy alcohol use. In: AMIA Annual Symposium Proceedings, vol.\u00a02021, p.\u00a0247. American Medical Informatics Association (2021)"},{"key":"25_CR10","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"10","key":"25_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3564269","volume":"55","author":"F Catania","year":"2023","unstructured":"Catania, F., Spitale, M., Garzotto, F.: Conversational agents in therapeutic interventions for neurodevelopmental disorders: a survey. ACM Comput. Surv. 55(10), 1\u201334 (2023)","journal-title":"ACM Comput. Surv."},{"key":"25_CR12","unstructured":"Chang, E.Y.: Uncovering biases with reflective large language models. arXiv preprint arXiv:2408.13464 (2024)"},{"issue":"3","key":"25_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3641289","volume":"15","author":"Y Chang","year":"2024","unstructured":"Chang, Y., et al.: A survey on evaluation of large language models. ACM Trans. Intell. Syst. Technol. 15(3), 1\u201345 (2024)","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Clavi\u00e9, B., Ciceu, A., Naylor, F., Souli\u00e9, G., Brightwell, T.: Large language models in the workplace: a case study on prompt engineering for job type classification. In: International Conference on Applications of Natural Language to Information Systems, pp. 3\u201317. Springer (2023)","DOI":"10.1007\/978-3-031-35320-8_1"},{"key":"25_CR15","unstructured":"Conroy, R.: Sample size: a rough guide (2015). http:\/\/www.beaumontethics.ie\/docs\/application\/samplesizecalculation.pdf"},{"key":"25_CR16","unstructured":"Conroy, R.M., et al.: The RCSI sample size handbook. A rough guide, pp. 59\u201361 (2016)"},{"key":"25_CR17","doi-asserted-by":"crossref","unstructured":"Dai, S., Xu, C., Xu, S., Pang, L., Dong, Z., Xu, J.: Bias and unfairness in information retrieval systems: new challenges in the LLM era. In: Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 6437\u20136447 (2024)","DOI":"10.1145\/3637528.3671458"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Dalianis, H., Dalianis, H.: Evaluation metrics and evaluation. In: Clinical Text Mining: Secondary Use of Electronic Patient Records, pp. 45\u201353 (2018)","DOI":"10.1007\/978-3-319-78503-5_6"},{"key":"25_CR19","unstructured":"Davis, E., Marcus, G.: Commonsense reasoning and commonsense knowledge in artificial intelligence AI has seen great advances of many kinds recently, but there is one critical area where progress has been extremely slow: ordinary commonsense"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"DB, M.Y., et al.: Classification of oil palm female inflorescences anthesis stages using machine learning approaches. Inf. Process. Agric. 8(4), 537\u2013549 (2021)","DOI":"10.1016\/j.inpa.2020.11.007"},{"key":"25_CR21","doi-asserted-by":"publisher","first-page":"1066","DOI":"10.1162\/tacl_a_00506","volume":"10","author":"N Dziri","year":"2022","unstructured":"Dziri, N., Rashkin, H., Linzen, T., Reitter, D.: Evaluating attribution in dialogue systems: the begin benchmark. Trans. Assoc. Comput. Linguist. 10, 1066\u20131083 (2022)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"25_CR22","unstructured":"Gallegos, I.O., et al.: Bias and fairness in large language models: a survey. Comput. Linguist. 1\u201379 (2024)"},{"key":"25_CR23","unstructured":"Hendrycks, D., et al.: Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Hepenstal, S., Zhang, L., Kodagoda, N., Wong, B.L.W.: Developing conversational agents for use in criminal investigations. ACM Trans. Interact. Intell. Syst. (TiiS) 11(3-4), 1\u201335 (2021)","DOI":"10.1145\/3444369"},{"key":"25_CR25","unstructured":"Iyer, A.: Expect the unexpected: the importance of model robustness (2023). https:\/\/www.fiddler.ai\/blog\/expect-the-unexpected-the-importance-of-model-robustness. Fiddler AI Blog"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Ji, Z., et al.: Survey of hallucination in natural language generation. ACM Comput. Surv. 55 (2023)","DOI":"10.1145\/3571730"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Khurana, A., Subramonyam, H., Chilana, P.K.: Why and when LLM-based assistants can go wrong: investigating the effectiveness of prompt-based interactions for software help-seeking. In: Proceedings of the 29th International Conference on Intelligent User Interfaces, pp. 288\u2013303 (2024)","DOI":"10.1145\/3640543.3645200"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Kim, T.S., Lee, Y., Shin, J., Kim, Y.H., Kim, J.: Evallm: interactive evaluation of large language model prompts on user-defined criteria. In: Proceedings of the CHI Conference on Human Factors in Computing Systems, pp. 1\u201321 (2024)","DOI":"10.1145\/3613904.3642216"},{"key":"25_CR29","unstructured":"fiddler-labs: fiddler-auditor (2023). https:\/\/github.com\/fiddler-labs\/fiddler-auditor"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Langevin, R., Lordon, R.J., Avrahami, T., Cowan, B.R., Hirsch, T., Hsieh, G.: Heuristic evaluation of conversational agents. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp. 1\u201315 (2021)","DOI":"10.1145\/3411764.3445312"},{"key":"25_CR31","volume":"177","author":"I Lee","year":"2020","unstructured":"Lee, I., Chan, K.Y.: Understanding farmers\u2019 adoption of precision agriculture technologies: an institutional perspective. Comput. Electron. Agric. 177, 105600 (2020)","journal-title":"Comput. Electron. Agric."},{"key":"25_CR32","unstructured":"Levene, H.: Robust tests for equality of variances. In: Contributions to Probability and Statistics, pp. 278\u2013292 (1960)"},{"key":"25_CR33","unstructured":"Li, L., Bamman, D.: Gender and representation bias in GPT-3 generated stories. In: ACL Workshop on Narrative Understanding (2021)"},{"key":"25_CR34","unstructured":"Mann, B., et al.: Language models are few-shot learners. arXiv preprint arXiv:2005.141651 (2020)"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Maynez, J., Narayan, S., Bohnet, B., McDonald, R.: On faithfulness and factuality in abstractive summarization. arXiv preprint arXiv:2005.00661 (2020)","DOI":"10.18653\/v1\/2020.acl-main.173"},{"key":"25_CR36","unstructured":"Minaee, S., et al.: Large language models: a survey. arXiv preprint arXiv:2402.06196 (2024)"},{"key":"25_CR37","doi-asserted-by":"crossref","unstructured":"Munechika, D., et al.: Visual auditor: interactive visualization for detection and summarization of model biases. In: 2022 IEEE Visualization and Visual Analytics (VIS), pp. 45\u201349. IEEE (2022)","DOI":"10.1109\/VIS54862.2022.00018"},{"key":"25_CR38","volume-title":"Machine Learning: A Probabilistic Perspective (Adaptive Computation and Machine Learning Series)","author":"KP Murphy","year":"2018","unstructured":"Murphy, K.P.: Machine Learning: A Probabilistic Perspective (Adaptive Computation and Machine Learning Series). The MIT Press, London (2018)"},{"key":"25_CR39","unstructured":"OpenAI: Hello GPT-4o (2024). https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"25_CR40","unstructured":"Paka, A., Gade, K., Kenthapadi, K.: The missing link in generative AI. Fiddler AI Blog (2023)"},{"key":"25_CR41","unstructured":"Reagan, M.: Not all rainbows and sunshine: the darker side of chatgpt. Towards Data Science (2023)"},{"key":"25_CR42","doi-asserted-by":"crossref","unstructured":"Ribeiro, M.T., Wu, T., Guestrin, C., Singh, S.: Beyond accuracy: behavioral testing of NLP models with checklist. arXiv preprint arXiv:2005.04118 (2020)","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"25_CR43","unstructured":"Sahoo, N.R., Saxena, A., Maharaj, K., Ahmad, A.A., Mishra, A., Bhattacharyya, P.: Addressing bias and hallucination in large language models. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries, pp. 73\u201379 (2024)"},{"key":"25_CR44","unstructured":"Saleiro, P., et al.: Aequitas: a bias and fairness audit toolkit. arXiv preprint arXiv:1811.05577 (2018)"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Shahi, A., Deng, J.D., Woodford, B.J.: A streaming ensemble classifier with multi-class imbalance learning for activity recognition. In: 2017 International Joint Conference on Neural Networks (IJCNN), pp. 3983\u20133990. IEEE (2017)","DOI":"10.1109\/IJCNN.2017.7966358"},{"key":"25_CR46","doi-asserted-by":"crossref","unstructured":"Shen, H., et al.: Human-AI interactive and continuous sensemaking: a case study of image classification using scribble attention maps. In: Extended Abstracts of the 2021 CHI Conference on Human Factors in Computing Systems, pp.\u00a01\u20138 (2021)","DOI":"10.1145\/3411763.3451798"},{"key":"25_CR47","doi-asserted-by":"crossref","unstructured":"Sheng, E., Chang, K.W., Natarajan, P., Peng, N.: Societal biases in language generation: progress and challenges. arXiv preprint arXiv:2105.04054 (2021)","DOI":"10.18653\/v1\/2021.acl-long.330"},{"key":"25_CR48","doi-asserted-by":"crossref","unstructured":"Simmons, G.: Moral mimicry: large language models produce moral rationalizations tailored to political identity. arXiv preprint arXiv:2209.12106 (2022)","DOI":"10.18653\/v1\/2023.acl-srw.40"},{"key":"25_CR49","unstructured":"Srivastava, A., et al.: Beyond the imitation game: quantifying and extrapolating the capabilities of language models. arXiv preprint arXiv:2206.04615 (2022)"},{"key":"25_CR50","doi-asserted-by":"crossref","unstructured":"Suzgun, M., et al.: Challenging big-bench tasks and whether chain-of-thought can solve them. arXiv preprint arXiv:2210.09261 (2022)","DOI":"10.18653\/v1\/2023.findings-acl.824"},{"key":"25_CR51","unstructured":"Wang, A., et al.: Superglue: a stickier benchmark for general-purpose language understanding systems. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"25_CR52","unstructured":"Wang, B., et al.: Adversarial glue: a multi-task benchmark for robustness evaluation of language models. arXiv preprint arXiv:2111.02840 (2021)"},{"key":"25_CR53","unstructured":"Wang, P., et al.: Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926 (2023)"},{"key":"25_CR54","unstructured":"Ye, J., Du, M., Wang, G.: Dataframe QA: a universal LLM framework on dataframe question answering without data exposure. arXiv preprint arXiv:2401.15463 (2024)"},{"key":"25_CR55","doi-asserted-by":"crossref","unstructured":"Zhao, J., Fang, M., Shi, Z., Li, Y., Chen, L., Pechenizkiy, M.: Chbias: bias evaluation and mitigation of Chinese conversational language models. arXiv preprint arXiv:2305.11262 (2023)","DOI":"10.18653\/v1\/2023.acl-long.757"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-6588-4_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T14:41:09Z","timestamp":1750689669000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-6588-4_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819665877","9789819665884"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-6588-4_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"24 June 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Auckland","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Zealand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iconip2024.org","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}