{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:06:20Z","timestamp":1775815580283,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":205,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Natural Science Foundation of China (NSFC)","award":["62377044"],"award-info":[{"award-number":["62377044"]}]},{"name":"Youth Innovation Promotion Association CAS","award":["2023111"],"award-info":[{"award-number":["2023111"]}]},{"name":"National Key R&D Program of China","award":["2023YFA1008704"],"award-info":[{"award-number":["2023YFA1008704"]}]},{"name":"National Natural Science Foundation of China (NSFC)","award":["62276248"],"award-info":[{"award-number":["62276248"]}]},{"name":"National Natural Science Foundation of China (NSFC)","award":["U21B2046"],"award-info":[{"award-number":["U21B2046"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671458","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:54:55Z","timestamp":1724561695000},"page":"6437-6447","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":89,"title":["Bias and Unfairness in Information Retrieval Systems: New Challenges in the LLM Era"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7549-0860","authenticated-orcid":false,"given":"Sunhao","family":"Dai","sequence":"first","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3070-9358","authenticated-orcid":false,"given":"Chen","family":"Xu","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7157-3410","authenticated-orcid":false,"given":"Shicheng","family":"Xu","sequence":"additional","affiliation":[{"name":"CAS Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1161-8546","authenticated-orcid":false,"given":"Liang","family":"Pang","sequence":"additional","affiliation":[{"name":"CAS Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2231-4663","authenticated-orcid":false,"given":"Zhenhua","family":"Dong","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7170-111X","authenticated-orcid":false,"given":"Jun","family":"Xu","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Abdollahpouri et al. 2020. Multistakeholder recommendation: Survey and research directions. User Modeling and User-Adapted Interaction (2020).","DOI":"10.1007\/s11257-019-09256-1"},{"key":"e_1_3_2_1_2_1","volume-title":"The unfairness of popularity bias in recommendation. arXiv","author":"Abdollahpouri Himan","year":"2019","unstructured":"Himan Abdollahpouri, Masoud Mansoury, Robin Burke, and Bamshad Mobasher. 2019. The unfairness of popularity bias in recommendation. 
arXiv (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Abubakar Abid Maheen Farooqi and James Zou. 2021. Persistent anti-muslim bias in large language models. In AAAI.","DOI":"10.1145\/3461702.3462624"},{"key":"e_1_3_2_1_4_1","unstructured":"Qingyao Ai Ting Bai et al. 2023. Information Retrieval Meets Large Language Models: A Strategic Report from Chinese IR Community. AI Open (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Ekin Aky\u00fcrek Tolga Bolukbasi et al. 2022. Towards tracing factual knowledge in language models back to the training data. arXiv (2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.180"},{"key":"e_1_3_2_1_6_1","unstructured":"Yuntao Bai Saurav Kadavath et al. 2022. Constitutional ai: Harmlessness from ai feedback. arXiv (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Longalign: A recipe for long context alignment of large language models. arXiv","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Xin Lv, Jiajie Zhang, Yuze He, et al. 2024. Longalign: A recipe for long context alignment of large language models. arXiv (2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"Yanhong Bai Jiabao Zhao Jinxin Shi Tingjiang Wei Xingjiao Wu and Liang He. 2023. FairMonitor: A Four-Stage Automatic Framework for Detecting Stereotypes and Biases in Large Language Models. arxiv: 2308.10397 [cs.CL]"},{"key":"e_1_3_2_1_9_1","unstructured":"Keqin Bao Jizhi Zhang Wenjie Wang et al. 2023. A bi-step grounding paradigm for large language models in recommendation systems. arXiv (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Emily M Bender Timnit Gebru et al. 2021. On the dangers of stochastic parrots: Can language models be too big?. In FAccT.","DOI":"10.1145\/3442188.3445922"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Camiel J Beukeboom et al. 2019. 
How stereotypes are shared through language: a review and introduction of the social categories and stereotypes communication (SCSC) framework. Review of Communication Research (2019).","DOI":"10.12840\/issn.2255-4165.017"},{"key":"e_1_3_2_1_12_1","volume-title":"Identifying and reducing gender bias in word-level language models. arXiv","author":"Bordia Shikha","year":"2019","unstructured":"Shikha Bordia and Samuel R Bowman. 2019. Identifying and reducing gender bias in word-level language models. arXiv (2019)."},{"key":"e_1_3_2_1_13_1","unstructured":"Yihan Cao Siyu Li et al. 2023. A comprehensive survey of ai-generated content (aigc): A history of generative ai from gan to chatgpt. arXiv (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Yupeng Chang Xu Wang et al. 2023. A survey on evaluation of large language models. TIST (2023)."},{"key":"e_1_3_2_1_15_1","unstructured":"Guiming Hardy Chen Shunian Chen et al. 2024. Humans or LLMs as the Judge? A Study on Judgement Biases. arXiv (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Jiawei Chen Hande Dong et al. 2023. Bias and debias in recommender system: A survey and future directions. TOIS (2023).","DOI":"10.1145\/3564284"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Jifan Chen Grace Kim et al. 2023. Complex Claim Verification with Evidence Retrieved in the Wild. arXiv (2023).","DOI":"10.18653\/v1\/2024.naacl-long.196"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Xiaoyang Chen Ben He et al. 2024. Spiral of Silence: How is Large Language Model Killing Information Retrieval?--A Case Study on Open Domain Question Answering. acl (2024).","DOI":"10.18653\/v1\/2024.acl-long.798"},{"key":"e_1_3_2_1_19_1","volume-title":"Longlora: Efficient fine-tuning of long-context large language models. arXiv","author":"Chen Yukang","year":"2023","unstructured":"Yukang Chen, Shengju Qian, et al. 2023. 
Longlora: Efficient fine-tuning of long-context large language models. arXiv (2023)."},{"key":"e_1_3_2_1_20_1","unstructured":"I Chern Steffi Chern et al. 2023. FacTool: Factuality Detection in Generative AI--A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios. arXiv (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Steffi Chern Ethan Chern et al. 2024. Can Large Language Models be Trusted for Evaluation? Scalable Meta-Evaluation of LLMs as Evaluators via Agent Debate. arXiv (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Can Large Language Models Be an Alternative to Human Evaluations? ACL","author":"Chiang Cheng-Han","year":"2023","unstructured":"Cheng-Han Chiang and Hung-yi Lee. 2023. Can Large Language Models Be an Alternative to Human Evaluations? ACL (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"PRE: A Peer Review Based Large Language Model Evaluator. arXiv","author":"Chu Zhumin","year":"2024","unstructured":"Zhumin Chu, Qingyao Ai, Yiteng Tu, Haitao Li, and Yiqun Liu. 2024. PRE: A Peer Review Based Large Language Model Evaluator. arXiv (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Dola: Decoding by contrasting layers improves factuality in large language models. arXiv","author":"Chuang Yung-Sung","year":"2023","unstructured":"Yung-Sung Chuang, Yujia Xie, et al. 2023. Dola: Decoding by contrasting layers improves factuality in large language models. arXiv (2023)."},{"key":"e_1_3_2_1_25_1","unstructured":"John Joon Young Chung et al. 2023. Increasing diversity while maintaining accuracy: Text data generation with large language models and human interventions. arXiv (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Cocktail: A Comprehensive Information Retrieval Benchmark with LLM-Generated Documents Integration. Findings of ACL","author":"Dai Sunhao","year":"2024","unstructured":"Sunhao Dai, Weihao Liu, et al. 2024. Cocktail: A Comprehensive Information Retrieval Benchmark with LLM-Generated Documents Integration. 
Findings of ACL (2024)."},{"key":"e_1_3_2_1_27_1","unstructured":"Sunhao Dai Ninglu Shao et al. 2023. Uncovering ChatGPT's Capabilities in Recommender Systems. In RecSys."},{"key":"e_1_3_2_1_28_1","unstructured":"Sunhao Dai Yuqi Zhou et al. 2024. Neural Retrievers are Biased Towards LLM-Generated Content. KDD (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Karin De Langis, et al","author":"Das Debarati","year":"2024","unstructured":"Debarati Das, Karin De Langis, et al. 2024. Under the Surface: Tracking the Artifactuality of LLM-Generated Data. arXiv (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Michela Del Vicario Gianna Vivaldo et al. 2016. Echo chambers: Emotional contagion and group polarization on facebook. Scientific reports (2016).","DOI":"10.1038\/srep37825"},{"key":"e_1_3_2_1_31_1","volume-title":"Temporal Stability, and Recency. arXiv","author":"Deldjoo Yashar","year":"2024","unstructured":"Yashar Deldjoo. 2024. Understanding Biases in ChatGPT-based Recommender Systems: Provider Fairness, Temporal Stability, and Recency. arXiv (2024)."},{"key":"e_1_3_2_1_32_1","unstructured":"Yashar Deldjoo and Tommaso di Noia. 2024. CFaiRLLM: Consumer Fairness Evaluation in Large-Language Model Recommender System."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Lucas Dixon John Li et al. 2018. Measuring and mitigating unintended bias in text classification. In AAAI.","DOI":"10.1145\/3278721.3278729"},{"key":"e_1_3_2_1_34_1","volume-title":"FEQA: A question answering evaluation framework for faithfulness assessment in abstractive summarization. arXiv","author":"Durmus Esin","year":"2020","unstructured":"Esin Durmus, He He, and Mona Diab. 2020. FEQA: A question answering evaluation framework for faithfulness assessment in abstractive summarization. arXiv (2020)."},{"key":"e_1_3_2_1_35_1","volume-title":"Evaluating groundedness in dialogue systems: The begin benchmark. 
arXiv","author":"Dziri Nouha","year":"2021","unstructured":"Nouha Dziri, Hannah Rashkin, Tal Linzen, and David Reitter. 2021. Evaluating groundedness in dialogue systems: The begin benchmark. arXiv (2021)."},{"key":"e_1_3_2_1_36_1","volume-title":"Gender stereotypes. Annual review of psychology","author":"Ellemers Naomi","year":"2018","unstructured":"Naomi Ellemers. 2018. Gender stereotypes. Annual review of psychology (2018)."},{"key":"e_1_3_2_1_37_1","unstructured":"Wenqi Fan Zihuai Zhao et al. 2023. Recommender systems in the era of large language models (llms). arXiv (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Bias of AI-generated content: an examination of news produced by large language models. Scientific Reports","author":"Fang Xiao","year":"2024","unstructured":"Xiao Fang, Shangkun Che, Minjia Mao, Hongzhe Zhang, Ming Zhao, and Xiaohang Zhao. 2024. Bias of AI-generated content: an examination of news produced by large language models. Scientific Reports (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Bridging the gap: A survey on integrating (human) feedback for natural language generation. TACL","author":"Fernandes Patrick","year":"2023","unstructured":"Patrick Fernandes, Aman Madaan, and others. 2023. Bridging the gap: A survey on integrating (human) feedback for natural language generation. TACL (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Felix Friedrich Manuel Brack et al. 2023. Fair diffusion: Instructing text-to-image generation models on fairness. arXiv (2023).","DOI":"10.1007\/s43681-024-00531-5"},{"key":"e_1_3_2_1_41_1","volume-title":"Gptscore: Evaluate as you desire. arXiv","author":"Fu Jinlan","year":"2023","unstructured":"Jinlan Fu, See-Kiong Ng, Zhengbao Jiang, and Pengfei Liu. 2023. Gptscore: Evaluate as you desire. arXiv (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Yao Fu Rameswar Panda et al. 2024. Data Engineering for Scaling Language Models to 128K Context. 
arXiv (2024)."},{"key":"e_1_3_2_1_43_1","unstructured":"Isabel O Gallegos Ryan A Rossi et al. 2023. Bias and fairness in large language models: A survey. arXiv (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Rarr: Researching and revising what language models say, using language models. arXiv","author":"Gao Luyu","year":"2022","unstructured":"Luyu Gao, Zhuyun Dai, et al. 2022. Rarr: Researching and revising what language models say, using language models. arXiv (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Llm-based nlg evaluation: Current status and challenges. arXiv","author":"Gao Mingqi","year":"2024","unstructured":"Mingqi Gao, Xinyu Hu, Jie Ruan, Xiao Pu, and Xiaojun Wan. 2024. Llm-based nlg evaluation: Current status and challenges. arXiv (2024)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Aparna Garimella Akhash Amarnath et al. 2021. He is very intelligent she is very beautiful? on mitigating social biases in language modelling and generation. In ACL Findings.","DOI":"10.18653\/v1\/2021.findings-acl.397"},{"key":"e_1_3_2_1_47_1","volume-title":"Gender-tuning: Empowering fine-tuning for debiasing pre-trained language models. arXiv","author":"Ghanbarzadeh Somayeh","year":"2023","unstructured":"Somayeh Ghanbarzadeh, Yan Huang, et al. 2023. Gender-tuning: Empowering fine-tuning for debiasing pre-trained language models. arXiv (2023)."},{"key":"e_1_3_2_1_48_1","unstructured":"Friedrich M G\u00f6tz et al. 2023. Let the algorithm speak: How to use neural networks for automatic item generation in psychological scale development. Psychological Methods (2023)."},{"key":"e_1_3_2_1_49_1","unstructured":"Roger Grosse Juhan Bae et al. 2023. Studying large language model generalization with influence functions. arXiv (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Nigel Guenole Andrew Samo et al. 2024. Pseudo-Discrimination Parameters from Language Embeddings. 
(2024).","DOI":"10.31234\/osf.io\/9a4qx"},{"key":"e_1_3_2_1_51_1","unstructured":"Suriya Gunasekar Yi Zhang et al. 2023. Textbooks Are All You Need. arXiv (2023)."},{"key":"e_1_3_2_1_52_1","unstructured":"Izzeddin Gur Hiroki Furuta et al. 2023. A real-world webagent with planning long context understanding and program synthesis. arXiv (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"The times they are a-changing? or are they not? A comparison of gender stereotypes","author":"Haines Elizabeth L","year":"1983","unstructured":"Elizabeth L Haines, Kay Deaux, and Nicole Lofaro. 2016. The times they are a-changing? or are they not? A comparison of gender stereotypes, 1983--2014. Psychology of Women Quarterly (2016)."},{"key":"e_1_3_2_1_54_1","volume-title":"Balancing out bias: Achieving fairness through balanced training. arXiv","author":"Han Xudong","year":"2021","unstructured":"Xudong Han, Timothy Baldwin, and Trevor Cohn. 2021. Balancing out bias: Achieving fairness through balanced training. arXiv (2021)."},{"key":"e_1_3_2_1_55_1","volume-title":"Machine-Made Media: Monitoring the Mobilization of Machine-Generated Articles on Misinformation and Mainstream News Websites. arXiv","author":"Hanley Hans WA","year":"2023","unstructured":"Hans WA Hanley and Zakir Durumeric. 2023. Machine-Made Media: Monitoring the Mobilization of Machine-Generated Articles on Misinformation and Mainstream News Websites. arXiv (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Allure: A systematic protocol for auditing and improving llm-based evaluation of text using iterative in-context-learning. arXiv","author":"Hasanbeig Hosein","year":"2023","unstructured":"Hosein Hasanbeig, Hiteshi Sharma, et al. 2023. Allure: A systematic protocol for auditing and improving llm-based evaluation of text using iterative in-context-learning. arXiv (2023)."},{"key":"e_1_3_2_1_57_1","unstructured":"Zhankui He Zhouhang Xie et al. 2023. Large language models as zero-shot conversational recommenders. 
In CIKM."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Yupeng Hou Junjie Zhang et al. 2024. Large Language Models are Zero-Shot Rankers for Recommender Systems. In ECIR.","DOI":"10.1007\/978-3-031-56060-6_24"},{"key":"e_1_3_2_1_59_1","unstructured":"Wenyue Hua Yingqiang Ge et al. 2023. Up5: Unbiased foundation model for fairness-aware recommendation. arXiv (2023)."},{"key":"e_1_3_2_1_60_1","unstructured":"Hui Huang Yingqi Qu et al. 2024. An Empirical Study of LLM-as-a-Judge for LLM Evaluation: Fine-tuned Judge Models are Task-specific Classifiers. arXiv (2024)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Lei Huang Weijiang Yu et al. 2023. A survey on hallucination in large language models: Principles taxonomy challenges and open questions. arXiv (2023).","DOI":"10.1145\/3703155"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Po-Sen Huang Huan Zhang et al. 2019. Reducing sentiment bias in language models via counterfactual evaluation. arXiv (2019).","DOI":"10.18653\/v1\/2020.findings-emnlp.7"},{"key":"e_1_3_2_1_63_1","unstructured":"Guangyuan Jiang Manjie Xu et al. 2024. Evaluating and inducing personality in pre-trained language models. NeurIPS (2024)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Meng Jiang Keqin Bao et al. 2024. Item-side Fairness of Large Language Model-based Recommendation System. arXiv (2024).","DOI":"10.1145\/3589334.3648158"},{"key":"e_1_3_2_1_65_1","unstructured":"Nikhil Kandpal Haikang Deng Adam Roberts Eric Wallace and Colin Raffel. 2023. Large language models struggle to learn long-tail knowledge. In ICML."},{"key":"e_1_3_2_1_66_1","volume-title":"Impact of co-occurrence on factual knowledge of large language models. arXiv","author":"Kang Cheongwoong","year":"2023","unstructured":"Cheongwoong Kang and Jaesik Choi. 2023. Impact of co-occurrence on factual knowledge of large language models. 
arXiv (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"Estimating the personality of white-box language models. CoRR, abs\/2204.12000","author":"Karra SR","year":"2023","unstructured":"SR Karra, ST Nguyen, and T Tulabandhula. 2023. Estimating the personality of white-box language models. CoRR, abs\/2204.12000 (2023)."},{"key":"e_1_3_2_1_68_1","volume-title":"Shang Gao, and Pablo Arredondo.","author":"Katz Daniel Martin","year":"2024","unstructured":"Daniel Martin Katz, Michael James Bommarito, Shang Gao, and Pablo Arredondo. 2024. Gpt-4 passes the bar exam. Philosophical Transactions of the Royal Society A (2024)."},{"key":"e_1_3_2_1_69_1","unstructured":"Minbeom Kim Hwanhee Lee et al. 2022. Critic-guided decoding for controlled text generation. arXiv (2022)."},{"key":"e_1_3_2_1_70_1","volume-title":"Evallm: Interactive evaluation of large language model prompts on user-defined criteria. arXiv","author":"Kim Tae Soo","year":"2023","unstructured":"Tae Soo Kim, Yoonjoo Lee, Jamin Shin, Young-Ho Kim, and Juho Kim. 2023. Evallm: Interactive evaluation of large language model prompts on user-defined criteria. arXiv (2023)."},{"key":"e_1_3_2_1_71_1","unstructured":"Ryan Koo Minhwa Lee et al. 2023. Benchmarking cognitive biases in large language models as evaluators. arXiv (2023)."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"crossref","unstructured":"Faisal Ladhak Esin Durmus et al. 2023. When do pre-training biases propagate to downstream tasks? a case study in text summarization. In EACL.","DOI":"10.18653\/v1\/2023.eacl-main.234"},{"key":"e_1_3_2_1_73_1","volume-title":"BEA Workshop","author":"Jr Antonio Laverghetta","year":"2023","unstructured":"Antonio Laverghetta Jr and John Licato. 2023. Generating better items for cognitive assessments using large language models. In BEA Workshop 2023."},{"key":"e_1_3_2_1_74_1","unstructured":"Katherine Lee Daphne Ippolito et al. 2021. Deduplicating training data makes language models better. 
arXiv (2021)."},{"key":"e_1_3_2_1_75_1","unstructured":"Nayeon Lee Wei Ping et al. 2022. Factuality enhanced language models for open-ended text generation. NeurIPS (2022)."},{"key":"e_1_3_2_1_76_1","volume-title":"Halueval: A large-scale hallucination evaluation benchmark for large language models. In EMNLP.","author":"Li Junyi","year":"2023","unstructured":"Junyi Li, Xiaoxue Cheng, Xin Zhao, Jian-Yun Nie, and Ji-Rong Wen. 2023. Halueval: A large-scale hallucination evaluation benchmark for large language models. In EMNLP."},{"key":"e_1_3_2_1_77_1","volume-title":"2023 f. LooGLE: Can Long-Context Language Models Understand Long Contexts? arXiv","author":"Li Jiaqi","year":"2023","unstructured":"Jiaqi Li, Mengmeng Wang, Zilong Zheng, and Muhan Zhang. 2023 f. LooGLE: Can Long-Context Language Models Understand Long Contexts? arXiv (2023)."},{"key":"e_1_3_2_1_78_1","volume-title":"2023 h. Large language models for generative recommendation: A survey and visionary discussions. arXiv","author":"Li Lei","year":"2023","unstructured":"Lei Li, Yongfeng Zhang, Dugang Liu, and Li Chen. 2023 h. Large language models for generative recommendation: A survey and visionary discussions. arXiv (2023)."},{"key":"e_1_3_2_1_79_1","volume-title":"2023 d. Prd: Peer rank and discussion improve large language model based evaluations. arXiv","author":"Li Ruosen","year":"2023","unstructured":"Ruosen Li, Teerth Patel, and Xinya Du. 2023 d. Prd: Peer rank and discussion improve large language model based evaluations. arXiv (2023)."},{"key":"e_1_3_2_1_80_1","unstructured":"Shaobo Li Xiaoguang Li et al. 2022. How pre-trained language models capture factual knowledge? a causal-inspired analysis. arXiv (2022)."},{"key":"e_1_3_2_1_81_1","volume-title":"2023 i. Tailoring personality traits in large language models via unsupervisedly-built personalized lexicons. arXiv","author":"Li Tianlong","year":"2023","unstructured":"Tianlong Li, Xiaoqing Zheng, and Xuanjing Huang. 2023 i. 
Tailoring personality traits in large language models via unsupervisedly-built personalized lexicons. arXiv (2023)."},{"key":"e_1_3_2_1_82_1","volume-title":"2023 g. A preliminary study of chatgpt on news recommendation: Personalization, provider fairness, fake news. arXiv","author":"Li Xinyi","year":"2023","unstructured":"Xinyi Li, Yongfeng Zhang, and Edward C Malthouse. 2023 g. A preliminary study of chatgpt on news recommendation: Personalization, provider fairness, fake news. arXiv (2023)."},{"key":"e_1_3_2_1_83_1","unstructured":"Yunqi Li Hanxiong Chen et al. 2023. Fairness in recommendation: Foundations methods and applications. ACM Transactions on Intelligent Systems and Technology (2023)."},{"key":"e_1_3_2_1_84_1","volume-title":"A survey on fairness in large language models. arXiv","author":"Li Yingji","year":"2023","unstructured":"Yingji Li, Mengnan Du, Rui Song, Xin Wang, and Ying Wang. 2023. A survey on fairness in large language models. arXiv (2023)."},{"key":"e_1_3_2_1_85_1","volume-title":"2023 e. Split and merge: Aligning position biases in large language model based evaluators. arXiv","author":"Li Zongjie","year":"2023","unstructured":"Zongjie Li, Chaozheng Wang, et al. 2023 e. Split and merge: Aligning position biases in large language model based evaluators. arXiv (2023)."},{"key":"e_1_3_2_1_86_1","unstructured":"Zhen Li Xiaohan Xu et al. 2024. Leveraging large language models for nlg evaluation: A survey. arXiv (2024)."},{"key":"e_1_3_2_1_87_1","unstructured":"Jianghao Lin Xinyi Dai et al. 2023. How Can Recommender Systems Benefit from Large Language Models: A Survey. arXiv (2023)."},{"key":"e_1_3_2_1_88_1","volume-title":"Truthfulqa: Measuring how models mimic human falsehoods. arXiv","author":"Lin Stephanie","year":"2021","unstructured":"Stephanie Lin, Jacob Hilton, and Owain Evans. 2021. Truthfulqa: Measuring how models mimic human falsehoods. arXiv (2021)."},{"key":"e_1_3_2_1_89_1","volume-title":"Does gender matter? 
towards fairness in dialogue systems. arXiv","author":"Liu Haochen","year":"2019","unstructured":"Haochen Liu, Jamell Dacon, Wenqi Fan, Hui Liu, Zitao Liu, and Jiliang Tang. 2019. Does gender matter? towards fairness in dialogue systems. arXiv (2019)."},{"key":"e_1_3_2_1_90_1","unstructured":"Nelson F Liu Kevin Lin et al. 2024. Lost in the middle: How language models use long contexts. TACL (2024)."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"crossref","unstructured":"Yang Liu Dan Iter et al. 2023. G-Eval: NLG Evaluation using Gpt-4 with Better Human Alignment. In EMNLP.","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_2_1_92_1","volume-title":"Nafise Sadat Moosavi, and Chenghua Lin","author":"Liu Yiqi","year":"2023","unstructured":"Yiqi Liu, Nafise Sadat Moosavi, and Chenghua Lin. 2023. Llms as narcissistic evaluators: When ego inflates evaluation scores. arXiv (2023)."},{"key":"e_1_3_2_1_93_1","unstructured":"Yang Liu Yuanshun Yao et al. 2023. Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language Models' Alignment. arXiv (2023)."},{"key":"e_1_3_2_1_94_1","volume-title":"Teacher-Student Training for Debiasing: General Permutation Debiasing for Large Language Models. arXiv","author":"Liusie Adian","year":"2024","unstructured":"Adian Liusie, Yassir Fathullah, and Mark JF Gales. 2024. Teacher-Student Training for Debiasing: General Permutation Debiasing for Large Language Models. arXiv (2024)."},{"key":"e_1_3_2_1_95_1","unstructured":"Kaiji Lu Piotr Mardziel et al. 2020. Gender bias in neural natural language processing. Logic language and security: essays dedicated to Andre Scedrov on the occasion of his 65th birthday (2020)."},{"key":"e_1_3_2_1_96_1","unstructured":"Sichun Luo Bowei He et al. 2023. RecRanker: Instruction Tuning Large Language Model as Ranker for Top-k Recommendation. arXiv (2023)."},{"key":"e_1_3_2_1_97_1","volume-title":"Large Language Models are Not Stable Recommender Systems. 
arXiv","author":"Ma Tianhui","year":"2023","unstructured":"Tianhui Ma, Yuan Cheng, Hengshu Zhu, and Hui Xiong. 2023. Large Language Models are Not Stable Recommender Systems. arXiv (2023)."},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"crossref","unstructured":"Alex Mallen Akari Asai et al. 2022. When not to trust language models: Investigating effectiveness and limitations of parametric and non-parametric memories. arXiv (2022).","DOI":"10.18653\/v1\/2023.acl-long.546"},{"key":"e_1_3_2_1_99_1","volume-title":"An introduction to information retrieval","author":"Manning Christopher D","unstructured":"Christopher D Manning. 2009. An introduction to information retrieval. Cambridge university press."},{"key":"e_1_3_2_1_100_1","volume-title":"On faithfulness and factuality in abstractive summarization. arXiv","author":"Maynez Joshua","year":"2020","unstructured":"Joshua Maynez, Shashi Narayan, Bernd Bohnet, and Ryan McDonald. 2020. On faithfulness and factuality in abstractive summarization. arXiv (2020)."},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"crossref","unstructured":"Nick McKenna Tianyi Li et al. 2023. Sources of Hallucination by Large Language Models on Inference Tasks. arXiv (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.182"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"crossref","unstructured":"Nicholas Meade Spandana Gella et al. 2023. Using in-context learning to improve dialogue safety. arXiv (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.796"},{"key":"e_1_3_2_1_103_1","volume-title":"A survey on bias and fairness in machine learning. ACM computing surveys (CSUR)","author":"Mehrabi Ninareh","year":"2021","unstructured":"Ninareh Mehrabi, Fred Morstatter, Nripsuta Saxena, Kristina Lerman, and Aram Galstyan. 2021. A survey on bias and fairness in machine learning. ACM computing surveys (CSUR) (2021)."},{"key":"e_1_3_2_1_104_1","volume-title":"The dark side of news community forums: Opinion manipulation trolls. 
Internet Research","author":"Mihaylov Todor","year":"2018","unstructured":"Todor Mihaylov, Tsvetomila Mihaylova, Preslav Nakov, Llu\u00eds M\u00e0rquez, Georgi D Georgiev, and Ivan Kolev Koychev. 2018. The dark side of news community forums: Opinion manipulation trolls. Internet Research (2018)."},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"crossref","unstructured":"Sewon Min Kalpesh Krishna et al. 2023. FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation. arXiv (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.741"},{"key":"e_1_3_2_1_106_1","volume-title":"Cross-task generalization via natural language crowdsourcing instructions. arXiv","author":"Mishra Swaroop","year":"2021","unstructured":"Swaroop Mishra, Daniel Khashabi, Chitta Baral, and Hannaneh Hajishirzi. 2021. Cross-task generalization via natural language crowdsourcing instructions. arXiv (2021)."},{"key":"e_1_3_2_1_107_1","volume-title":"Webgpt: Browser-assisted question-answering with human feedback. arXiv","author":"Nakano Reiichiro","year":"2021","unstructured":"Reiichiro Nakano, Jacob Hilton, et al. 2021. Webgpt: Browser-assisted question-answering with human feedback. arXiv (2021)."},{"key":"e_1_3_2_1_108_1","unstructured":"Helen Ngo Cooper Raterink et al. 2021. Mitigating harm in language models with conditional-likelihood filtration. arXiv (2021)."},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"crossref","unstructured":"Eirini Ntoutsi Pavlos Fafalios et al. 2020. Bias in data-driven artificial intelligence systems-An introductory survey. Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery (2020).","DOI":"10.1002\/widm.1356"},{"key":"e_1_3_2_1_110_1","volume-title":"Eunsol Choi, and Greg Durrett.","author":"Onoe Yasumasa","year":"2022","unstructured":"Yasumasa Onoe, Michael JQ Zhang, Eunsol Choi, and Greg Durrett. 2022. Entity cloze by date: What LMs know about unseen entities. 
arXiv (2022)."},{"key":"e_1_3_2_1_111_1","volume-title":"BLIND: Bias removal with no demographics. arXiv","author":"Orgad Hadas","year":"2022","unstructured":"Hadas Orgad and Yonatan Belinkov. 2022. BLIND: Bias removal with no demographics. arXiv (2022)."},{"key":"e_1_3_2_1_112_1","unstructured":"Long Ouyang Jeffrey Wu et al. 2022. Training language models to follow instructions with human feedback. NeurIPS (2022)."},{"key":"e_1_3_2_1_113_1","volume-title":"Do llms possess a personality? making the mbti test an amazing evaluation for large language models. arXiv","author":"Pan Keyu","year":"2023","unstructured":"Keyu Pan and Yawen Zeng. 2023. Do llms possess a personality? making the mbti test an amazing evaluation for large language models. arXiv (2023)."},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"crossref","unstructured":"SunYoung Park Kyuri Choi Haeun Yu and Youngjoong Ko. 2023. Never too late to learn: Regularizing gender bias in coreference resolution. In WSDM.","DOI":"10.1145\/3539597.3570473"},{"key":"e_1_3_2_1_115_1","volume-title":"Fairrec: Two-sided fairness for personalized recommendations in two-sided platforms. In WWW.","author":"Patro Gourab K","year":"2020","unstructured":"Gourab K Patro, Arpita Biswas, Niloy Ganguly, Krishna P Gummadi, and Abhijnan Chakraborty. 2020. Fairrec: Two-sided fairness for personalized recommendations in two-sided platforms. In WWW."},{"key":"e_1_3_2_1_116_1","volume-title":"Large language models sensitivity to the order of options in multiple-choice questions. arXiv","author":"Pezeshkpour Pouya","year":"2023","unstructured":"Pouya Pezeshkpour and Estevam Hruschka. 2023. Large language models sensitivity to the order of options in multiple-choice questions. arXiv (2023)."},{"key":"e_1_3_2_1_117_1","volume-title":"Generalized instruction following with pictorial prompts. Journal of Applied Behavior Analysis","author":"Phillips Cara L","year":"2012","unstructured":"Cara L Phillips and Timothy R Vollmer. 2012. 
Generalized instruction following with pictorial prompts. Journal of Applied Behavior Analysis (2012)."},{"key":"e_1_3_2_1_118_1","volume-title":"Fairness in rankings and recommendations: an overview. The VLDB Journal","author":"Pitoura Evaggelia","year":"2022","unstructured":"Evaggelia Pitoura, Kostas Stefanidis, and Georgia Koutrika. 2022. Fairness in rankings and recommendations: an overview. The VLDB Journal (2022)."},{"key":"e_1_3_2_1_119_1","volume-title":"Measuring and narrowing the compositionality gap in language models. arXiv","author":"Press Ofir","year":"2022","unstructured":"Ofir Press, Muru Zhang, Sewon Min, Ludwig Schmidt, Noah A Smith, and Mike Lewis. 2022. Measuring and narrowing the compositionality gap in language models. arXiv (2022)."},{"key":"e_1_3_2_1_120_1","volume-title":"Summarization is (almost) dead. arXiv","author":"Pu Xiao","year":"2023","unstructured":"Xiao Pu, Mingqi Gao, and Xiaojun Wan. 2023. Summarization is (almost) dead. arXiv (2023)."},{"key":"e_1_3_2_1_121_1","volume-title":"Reducing gender bias in word-level language models with a gender-equalizing loss function. arXiv","author":"Qian Yusu","year":"2019","unstructured":"Yusu Qian, Urwa Muaz, Ben Zhang, and Jae Won Hyun. 2019. Reducing gender bias in word-level language models with a gender-equalizing loss function. arXiv (2019)."},{"key":"e_1_3_2_1_122_1","volume-title":"T5score: Discriminative fine-tuning of generative evaluation metrics. arXiv","author":"Qin Yiwei","year":"2022","unstructured":"Yiwei Qin, Weizhe Yuan, Graham Neubig, and Pengfei Liu. 2022. T5score: Discriminative fine-tuning of generative evaluation metrics. arXiv (2022)."},{"key":"e_1_3_2_1_123_1","doi-asserted-by":"crossref","unstructured":"Zhen Qin Rolf Jagerman et al. 2023. Large language models are effective text rankers with pairwise ranking prompting. 
arXiv (2023).","DOI":"10.18653\/v1\/2024.findings-naacl.97"},{"key":"e_1_3_2_1_124_1","volume-title":"CLongEval: A Chinese Benchmark for Evaluating Long-Context Large Language Models. arXiv","author":"Qiu Zexuan","year":"2024","unstructured":"Zexuan Qiu, Jingjing Li, Shijue Huang, Wanjun Zhong, and Irwin King. 2024. CLongEval: A Chinese Benchmark for Evaluating Long-Context Large Language Models. arXiv (2024)."},{"key":"e_1_3_2_1_125_1","unstructured":"Colin Raffel Noam Shazeer et al. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research (2020)."},{"key":"e_1_3_2_1_126_1","doi-asserted-by":"crossref","unstructured":"Ori Ram Yoav Levine et al. 2023. In-context retrieval-augmented language models. arXiv (2023).","DOI":"10.1162\/tacl_a_00605"},{"key":"e_1_3_2_1_127_1","volume-title":"Educational Multi-Question Generation for Reading Comprehension. In BEA Workshop","author":"Rathod Manav","year":"2022","unstructured":"Manav Rathod, Tony Tu, and Katherine Stasaski. 2022. Educational Multi-Question Generation for Reading Comprehension. In BEA Workshop 2022."},{"key":"e_1_3_2_1_128_1","doi-asserted-by":"crossref","unstructured":"Mustafa Safdari Greg Serapio-Garc\u00eda et al. 2023. Personality traits in large language models. arXiv (2023).","DOI":"10.21203\/rs.3.rs-3296728\/v1"},{"key":"e_1_3_2_1_129_1","volume-title":"Verbosity bias in preference labeling by large language models. arXiv","author":"Saito Keita","year":"2023","unstructured":"Keita Saito, Akifumi Wachi, Koki Wataoka, and Youhei Akimoto. 2023. Verbosity bias in preference labeling by large language models. arXiv (2023)."},{"key":"e_1_3_2_1_130_1","unstructured":"Tom Sander Pierre Fernandez et al. 2024. Watermarking Makes Language Models Radioactive. arXiv (2024)."},{"key":"e_1_3_2_1_131_1","unstructured":"Victor Sanh Albert Webson et al. 2021. Multitask prompted training enables zero-shot task generalization. 
arXiv (2021)."},{"key":"e_1_3_2_1_132_1","volume-title":"Cognitive interference: Theories, methods, and findings","author":"Sarason Irwin G","unstructured":"Irwin G Sarason, Gregory R Pierce, and Barbara R Sarason. 2014. Cognitive interference: Theories, methods, and findings. Routledge."},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"crossref","unstructured":"Patrick Schramowski Cigdem Turan et al. 2022. Large pre-trained language models contain human-like biases of what is right and wrong to do. Nature Machine Intelligence (2022).","DOI":"10.1038\/s42256-022-00458-8"},{"key":"e_1_3_2_1_134_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642459"},{"key":"e_1_3_2_1_135_1","unstructured":"Weijia Shi Anirudh Ajith et al. 2023. Detecting pretraining data from large language models. arXiv (2023)."},{"key":"e_1_3_2_1_136_1","volume-title":"Replug: Retrieval-augmented black-box language models. arXiv","author":"Shi Weijia","year":"2023","unstructured":"Weijia Shi, Sewon Min, et al. 2023. Replug: Retrieval-augmented black-box language models. arXiv (2023)."},{"key":"e_1_3_2_1_137_1","unstructured":"Kurt Shuster Jing Xu et al. 2022. Blenderbot 3: a deployed conversational agent that continually learns to responsibly engage. arXiv (2022)."},{"key":"e_1_3_2_1_138_1","unstructured":"Amit Singhal et al. 2001. Modern information retrieval: A brief overview. IEEE Data Eng. Bull. (2001)."},{"key":"e_1_3_2_1_139_1","unstructured":"Karan Singhal Tao Tu et al. 2023. Towards expert-level medical question answering with large language models. arXiv (2023)."},{"key":"e_1_3_2_1_140_1","volume-title":"ChatGPT for Conversational Recommendation: Refining Recommendations by Reprompting with Feedback. arXiv","author":"Spurlock Kyle Dylan","year":"2024","unstructured":"Kyle Dylan Spurlock, Cagla Acun, Esin Saka, and Olfa Nasraoui. 2024. ChatGPT for Conversational Recommendation: Refining Recommendations by Reprompting with Feedback. 
arXiv (2024)."},{"key":"e_1_3_2_1_141_1","doi-asserted-by":"crossref","unstructured":"Hao Sun Zhexin Zhang et al. 2022. MoralDial: A framework to train and evaluate moral dialogue systems via moral discussions. arXiv (2022).","DOI":"10.18653\/v1\/2023.acl-long.123"},{"key":"e_1_3_2_1_142_1","volume-title":"Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agent. arXiv","author":"Sun Weiwei","year":"2023","unstructured":"Weiwei Sun, Lingyong Yan, Xinyu Ma, Pengjie Ren, Dawei Yin, and Zhaochun Ren. 2023. Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agent. arXiv (2023)."},{"key":"e_1_3_2_1_143_1","volume-title":"Recitation-augmented language models. arXiv","author":"Sun Zhiqing","year":"2022","unstructured":"Zhiqing Sun, Xuezhi Wang, Yi Tay, Yiming Yang, and Denny Zhou. 2022. Recitation-augmented language models. arXiv (2022)."},{"key":"e_1_3_2_1_144_1","volume-title":"Approximating Human Evaluation of Social Chatbots with Prompting. arXiv","author":"Svikhnushina Ekaterina","year":"2023","unstructured":"Ekaterina Svikhnushina and Pearl Pu. 2023. Approximating Human Evaluation of Social Chatbots with Prompting. arXiv (2023)."},{"key":"e_1_3_2_1_145_1","volume-title":"Generated Contexts: How Language Models Merge Generated and Retrieved Contexts for Open-Domain QA? ACL","author":"Tan Hexiang","year":"2024","unstructured":"Hexiang Tan, Fei Sun, et al. 2024. Blinded by Generated Contexts: How Language Models Merge Generated and Retrieved Contexts for Open-Domain QA? ACL (2024)."},{"key":"e_1_3_2_1_146_1","volume-title":"Found in the middle: Permutation self-consistency improves listwise ranking in large language models. arXiv","author":"Tang Raphael","year":"2023","unstructured":"Raphael Tang, Xinyu Zhang, Xueguang Ma, Jimmy Lin, and Ferhan Ture. 2023. Found in the middle: Permutation self-consistency improves listwise ranking in large language models. 
arXiv (2023)."},{"key":"e_1_3_2_1_147_1","unstructured":"Hugo Touvron Louis Martin et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv (2023)."},{"key":"e_1_3_2_1_148_1","doi-asserted-by":"crossref","unstructured":"Tom R Tyler and E Allan Lind. 2002. Procedural justice. In Handbook of justice research in law.","DOI":"10.1037\/t19566-000"},{"key":"e_1_3_2_1_149_1","volume":"199","author":"Tyler Tom R","unstructured":"Tom R Tyler and Heather J Smith. 1995. Social justice and social movements. (1995).","journal-title":"Heather J Smith."},{"key":"e_1_3_2_1_150_1","volume-title":"Saferdialogues: Taking feedback gracefully after conversational safety failures. arXiv","author":"Ung Megan","year":"2021","unstructured":"Megan Ung, Jing Xu, and Y-Lan Boureau. 2021. Saferdialogues: Taking feedback gracefully after conversational safety failures. arXiv (2021)."},{"key":"e_1_3_2_1_151_1","volume-title":"WASA: Watermark-based source attribution for large language model-generated data. arXiv","author":"Wang Jingtan","year":"2023","unstructured":"Jingtan Wang, Xinyang Lu, et al. 2023. WASA: Watermark-based source attribution for large language model-generated data. arXiv (2023)."},{"key":"e_1_3_2_1_152_1","doi-asserted-by":"crossref","unstructured":"Liwen Wang Yuanmeng Yan Keqing He Yanan Wu and Weiran Xu. 2021. Dynamically disentangling social bias from task-oriented representations with adversarial attack. In NAACL.","DOI":"10.18653\/v1\/2021.naacl-main.293"},{"key":"e_1_3_2_1_153_1","volume-title":"2023 g. Query2doc: Query Expansion with Large Language Models. arXiv","author":"Wang Liang","year":"2023","unstructured":"Liang Wang, Nan Yang, and Furu Wei. 2023 g. Query2doc: Query Expansion with Large Language Models. arXiv (2023)."},{"key":"e_1_3_2_1_154_1","volume-title":"2023 h. Recagent: A novel simulation paradigm for recommender systems. arXiv","author":"Wang Lei","year":"2023","unstructured":"Lei Wang, Jingsen Zhang, et al. 2023 h. 
Recagent: A novel simulation paradigm for recommender systems. arXiv (2023)."},{"key":"e_1_3_2_1_155_1","unstructured":"Peiyi Wang Lei Li et al. 2023. Large language models are not fair evaluators. arXiv (2023)."},{"key":"e_1_3_2_1_156_1","unstructured":"Rui Wang Pengyu Cheng and Ricardo Henao. 2023. Toward fairness in text generation via mutual information minimization based on importance sampling. In AISTATS."},{"key":"e_1_3_2_1_157_1","unstructured":"Weizhi Wang Li Dong et al. 2024. Augmenting language models with long-term memory. NeurIPS (2024)."},{"key":"e_1_3_2_1_158_1","volume-title":"2023 e. Improving Conversational Recommendation Systems via Bias Analysis and Language-Model-Enhanced Data Augmentation. arXiv","author":"Wang Xi","year":"2023","unstructured":"Xi Wang, Hossein A Rahmani, Jiqun Liu, and Emine Yilmaz. 2023 e. Improving Conversational Recommendation Systems via Bias Analysis and Language-Model-Enhanced Data Augmentation. arXiv (2023)."},{"key":"e_1_3_2_1_159_1","unstructured":"Xuezhi Wang Jason Wei et al. 2023 f. Self-Consistency Improves Chain of Thought Reasoning in Language Models. arxiv: 2203.11171 [cs.CL]"},{"key":"e_1_3_2_1_160_1","volume-title":"Self-instruct: Aligning language models with self-generated instructions. arXiv","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Yeganeh Kordi, et al. 2022. Self-instruct: Aligning language models with self-generated instructions. arXiv (2022)."},{"key":"e_1_3_2_1_161_1","volume-title":"2023 d. A survey on the fairness of recommender systems. ACM Transactions on Information Systems","author":"Wang Yifan","year":"2023","unstructured":"Yifan Wang, Weizhi Ma, Min Zhang, Yiqun Liu, and Shaoping Ma. 2023 d. A survey on the fairness of recommender systems. ACM Transactions on Information Systems (2023)."},{"key":"e_1_3_2_1_162_1","unstructured":"Jiaxin Wen Pei Ke et al. 2023. Unveiling the implicit toxicity in large language models. 
arXiv (2023)."},{"key":"e_1_3_2_1_163_1","volume-title":"Low frequency names exhibit bias and overfitting in contextualizing language models. arXiv","author":"Wolfe Robert","year":"2021","unstructured":"Robert Wolfe and Aylin Caliskan. 2021. Low frequency names exhibit bias and overfitting in contextualizing language models. arXiv (2021)."},{"key":"e_1_3_2_1_164_1","unstructured":"Tae-Jin Woo Woo-Jeoung Nam Yeong-Joon Ju and Seong-Whan Lee. 2023. Compensatory debiasing for gender imbalances in language models. In ICASSP."},{"key":"e_1_3_2_1_165_1","volume-title":"Ai-generated content (aigc): A survey. arXiv","author":"Wu Jiayang","year":"2023","unstructured":"Jiayang Wu, Wensheng Gan, Zefeng Chen, Shicheng Wan, and Hong Lin. 2023. Ai-generated content (aigc): A survey. arXiv (2023)."},{"key":"e_1_3_2_1_166_1","volume-title":"Exploring large language model for graph data understanding in online job recommendations. arXiv","author":"Wu Likang","year":"2023","unstructured":"Likang Wu, Zhaopeng Qiu, Zhi Zheng, Hengshu Zhu, and Enhong Chen. 2023. Exploring large language model for graph data understanding in online job recommendations. arXiv (2023)."},{"key":"e_1_3_2_1_167_1","unstructured":"Likang Wu Zhi Zheng et al. 2023. A Survey on Large Language Models for Recommendation. arXiv (2023)."},{"key":"e_1_3_2_1_168_1","volume-title":"Style over substance: Evaluation biases for large language models. arXiv","author":"Wu Minghao","year":"2023","unstructured":"Minghao Wu and Alham Fikri Aji. 2023. Style over substance: Evaluation biases for large language models. arXiv (2023)."},{"key":"e_1_3_2_1_169_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583296"},{"key":"e_1_3_2_1_170_1","volume-title":"2023 d. Do llms implicitly exhibit user discrimination in recommendation? an empirical study. arXiv","author":"Xu Chen","year":"2023","unstructured":"Chen Xu, Wenjie Wang, Yuxin Li, Liang Pang, Jun Xu, and Tat-Seng Chua. 2023 d. 
Do llms implicitly exhibit user discrimination in recommendation? an empirical study. arXiv (2023)."},{"key":"e_1_3_2_1_171_1","doi-asserted-by":"crossref","unstructured":"Jun Xu Xiangnan He and Hang Li. 2018. Deep Learning for Matching in Search and Recommendation. In SIGIR.","DOI":"10.1145\/3209978.3210181"},{"key":"e_1_3_2_1_172_1","volume-title":"2024 d. Prompting Large Language Models for Recommender Systems: A Comprehensive Framework and Empirical Analysis. arXiv","author":"Xu Lanling","year":"2024","unstructured":"Lanling Xu, Junjie Zhang, et al. 2024 d. Prompting Large Language Models for Recommender Systems: A Comprehensive Framework and Empirical Analysis. arXiv (2024)."},{"key":"e_1_3_2_1_173_1","unstructured":"Peng Xu Wei Ping et al. 2023. Retrieval meets long context large language models. arXiv (2023)."},{"key":"e_1_3_2_1_174_1","unstructured":"Shicheng Xu Danyang Hou et al. 2023. AI-Generated Images Introduce Invisible Relevance Bias to Text-Image Retrieval. arXiv (2023)."},{"key":"e_1_3_2_1_175_1","unstructured":"Shicheng Xu Liang Pang et al. 2024. List-aware reranking-truncation joint model for search and retrieval-augmented generation. WWW (2024)."},{"key":"e_1_3_2_1_176_1","unstructured":"Shicheng Xu Liang Pang et al. 2024. Search-in-the-Chain: Towards the Accurate Credible and Traceable Content Generation for Complex Knowledge-intensive Tasks. WWW (2024)."},{"key":"e_1_3_2_1_177_1","unstructured":"Shicheng Xu Liang Pang et al. 2024. Unsupervised Information Refinement Training of Large Language Models for Retrieval-Augmented Generation. arXiv (2024)."},{"key":"e_1_3_2_1_178_1","volume-title":"2024 e. Perils of Self-Feedback: Self-Bias Amplifies in Large Language Models. arXiv","author":"Xu Wenda","year":"2024","unstructured":"Wenda Xu, Guanglei Zhu, et al. 2024 e. Perils of Self-Feedback: Self-Bias Amplifies in Large Language Models. arXiv (2024)."},{"key":"e_1_3_2_1_179_1","unstructured":"Jintang Xue Yun-Cheng Wang et al. 2023. 
Bias and fairness in chatbots: An overview. arXiv (2023)."},{"key":"e_1_3_2_1_180_1","volume-title":"Adept: A debiasing prompt framework. In AAAI.","author":"Yang Ke","year":"2023","unstructured":"Ke Yang, Charles Yu, Yi R Fung, Manling Li, and Heng Ji. 2023. Adept: A debiasing prompt framework. In AAAI."},{"key":"e_1_3_2_1_181_1","doi-asserted-by":"crossref","unstructured":"Tao Yang Tianyuan Shi et al. 2023. PsyCoT: Psychological Questionnaire as Powerful Chain-of-Thought for Personality Detection. arXiv (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.216"},{"key":"e_1_3_2_1_182_1","volume-title":"Unified detoxifying and debiasing in language generation via inference-time adaptive optimization. arXiv","author":"Yang Zonghan","year":"2022","unstructured":"Zonghan Yang, Xiaoyuan Yi, Peng Li, Yang Liu, and Xing Xie. 2022. Unified detoxifying and debiasing in language generation via inference-time adaptive optimization. arXiv (2022)."},{"key":"e_1_3_2_1_183_1","unstructured":"Seonghyeon Ye Hyeonbin Hwang et al. 2023. Investigating the effectiveness of task-agnostic prefix prompt for instruction following. arXiv (2023)."},{"key":"e_1_3_2_1_184_1","volume-title":"Improving Language Models via Plug-and-Play Retrieval Feedback. arXiv","author":"Yu Wenhao","year":"2023","unstructured":"Wenhao Yu, Zhihan Zhang, Zhenwen Liang, Meng Jiang, and Ashish Sabharwal. 2023. Improving Language Models via Plug-and-Play Retrieval Feedback. arXiv (2023)."},{"key":"e_1_3_2_1_185_1","volume-title":"Bartscore: Evaluating generated text as text generation. NeurIPS","author":"Yuan Weizhe","year":"2021","unstructured":"Weizhe Yuan, Graham Neubig, and Pengfei Liu. 2021. Bartscore: Evaluating generated text as text generation. NeurIPS (2021)."},{"key":"e_1_3_2_1_186_1","volume-title":"Economics of chatgpt: A labor market view on the occupational impact of artificial intelligence. SSRN 4350925","author":"Zarifhonarvar Ali","year":"2023","unstructured":"Ali Zarifhonarvar. 2023. 
Economics of chatgpt: A labor market view on the occupational impact of artificial intelligence. SSRN 4350925 (2023)."},{"key":"e_1_3_2_1_187_1","volume-title":"Should we attend more or less? modulating attention for fairness. arXiv","author":"Zayed Abdelrahman","year":"2023","unstructured":"Abdelrahman Zayed, Goncalo Mordido, Samira Shabanian, and Sarath Chandar. 2023. Should we attend more or less? modulating attention for fairness. arXiv (2023)."},{"key":"e_1_3_2_1_188_1","volume-title":"Fairness in ranking: A survey. arXiv","author":"Zehlike Meike","year":"2021","unstructured":"Meike Zehlike, Ke Yang, and Julia Stoyanovich. 2021. Fairness in ranking: A survey. arXiv (2021)."},{"key":"e_1_3_2_1_189_1","volume-title":"2023 e. On generative agents in recommendation. arXiv","author":"Zhang An","year":"2023","unstructured":"An Zhang, Leheng Sheng, Yuxin Chen, Hao Li, Yang Deng, Xiang Wang, and Tat-Seng Chua. 2023 e. On generative agents in recommendation. arXiv (2023)."},{"key":"e_1_3_2_1_190_1","doi-asserted-by":"crossref","unstructured":"Jizhi Zhang Keqin Bao Yang Zhang Wenjie Wang Fuli Feng and Xiangnan He. 2023. Is chatgpt fair for recommendation? evaluating fairness in large language model recommendation. In RecSys.","DOI":"10.1145\/3604915.3608860"},{"key":"e_1_3_2_1_191_1","volume-title":"Agentcf: Collaborative learning with autonomous language agents for recommender systems. arXiv","author":"Zhang Junjie","year":"2023","unstructured":"Junjie Zhang, Yupeng Hou, et al. 2023. Agentcf: Collaborative learning with autonomous language agents for recommender systems. arXiv (2023)."},{"key":"e_1_3_2_1_192_1","doi-asserted-by":"crossref","unstructured":"Yang Zhang Fuli Feng et al. 2021. Causal intervention for leveraging popularity bias in recommendation. In SIGIR.","DOI":"10.1145\/3404835.3462875"},{"key":"e_1_3_2_1_193_1","unstructured":"Yue Zhang Yafu Li et al. 2023 d. Siren's Song in the AI Ocean: A Survey on Hallucination in Large Language Models. 
arxiv: 2309.01219 [cs.CL]"},{"key":"e_1_3_2_1_194_1","volume-title":"Safetybench: Evaluating the safety of large language models with multiple choice questions. arXiv","author":"Zhang Zhexin","year":"2023","unstructured":"Zhexin Zhang, Leqi Lei, et al. 2023. Safetybench: Evaluating the safety of large language models with multiple choice questions. arXiv (2023)."},{"key":"e_1_3_2_1_195_1","unstructured":"Chujie Zheng Hao Zhou Fandong Meng Jie Zhou and Minlie Huang. 2023. Large language models are not robust multiple choice selectors. In ICLR."},{"key":"e_1_3_2_1_196_1","unstructured":"Lianmin Zheng Wei-Lin Chiang et al. 2024. Judging llm-as-a-judge with mt-bench and chatbot arena. NeurIPS (2024)."},{"key":"e_1_3_2_1_197_1","volume-title":"Why Does ChatGPT Fall Short in Answering Questions Faithfully? arXiv","author":"Zheng Shen","year":"2023","unstructured":"Shen Zheng, Jie Huang, and Kevin Chen-Chuan Chang. 2023. Why Does ChatGPT Fall Short in Answering Questions Faithfully? arXiv (2023)."},{"key":"e_1_3_2_1_198_1","volume-title":"Generative job recommendations with large language model. arXiv","author":"Zheng Zhi","year":"2023","unstructured":"Zhi Zheng, Zhaopeng Qiu, Xiao Hu, Likang Wu, Hengshu Zhu, and Hui Xiong. 2023. Generative job recommendations with large language model. arXiv (2023)."},{"key":"e_1_3_2_1_199_1","volume-title":"Causal-debias: Unifying debiasing in pretrained language models and fine-tuning via causal invariant learning. In ACL.","author":"Zhou Fan","year":"2023","unstructured":"Fan Zhou, Yuzhou Mao, et al. 2023. Causal-debias: Unifying debiasing in pretrained language models and fine-tuning via causal invariant learning. In ACL."},{"key":"e_1_3_2_1_200_1","unstructured":"Yuqi Zhou Sunhao Dai et al. 2024. Source Echo Chamber: Exploring the Escalation of Source Bias in User Data and Recommender System Feedback Loop. arXiv (2024)."},{"key":"e_1_3_2_1_201_1","unstructured":"Yutao Zhu Huaying Yuan et al. 2023. 
Large language models for information retrieval: A survey. arXiv (2023)."},{"key":"e_1_3_2_1_202_1","doi-asserted-by":"crossref","unstructured":"Ziwei Zhu Yun He Xing Zhao and James Caverlee. 2021. Popularity bias in dynamic recommendation. In KDD.","DOI":"10.1145\/3447548.3467376"},{"key":"e_1_3_2_1_203_1","volume-title":"Exploring ai ethics of chatgpt: A diagnostic analysis. arXiv","author":"Zhuo Terry Yue","year":"2023","unstructured":"Terry Yue Zhuo, Yujin Huang, Chunyang Chen, and Zhenchang Xing. 2023. Exploring ai ethics of chatgpt: A diagnostic analysis. arXiv (2023)."},{"key":"e_1_3_2_1_204_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.bea-1.10"},{"key":"e_1_3_2_1_205_1","volume-title":"Automated distractor generation for fill-in-the-blank items using a prompt-based learning approach. Psychological Testing and Assessment Modeling","author":"Zu Jiyun","year":"2023","unstructured":"Jiyun Zu, Ikkyu Choi, and Jiangang Hao. 2023. Automated distractor generation for fill-in-the-blank items using a prompt-based learning approach. 
Psychological Testing and Assessment Modeling (2023)."}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Barcelona Spain","acronym":"KDD '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671458","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671458","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:26Z","timestamp":1750291406000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671458"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":205,"alternative-id":["10.1145\/3637528.3671458","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671458","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}