{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:08:33Z","timestamp":1776884913756,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["Nos. L244063"],"award-info":[{"award-number":["Nos. L244063"]}]},{"name":"Peking University Medicine plus X Pilot Program-Key Technologies R&D Project","award":["2024YXXLHGG007"],"award-info":[{"award-number":["2024YXXLHGG007"]}]},{"name":"Health Data Research UK-The Alan Turing Institute Wellcome PhD Programme in Health Data Science","award":["Grant Ref: 218529\/Z\/19\/Z"],"award-info":[{"award-number":["Grant Ref: 218529\/Z\/19\/Z"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402017, U23A20468"],"award-info":[{"award-number":["62402017, U23A20468"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Xuzhou Scientific Technological Projects","award":["KC23143"],"award-info":[{"award-number":["KC23143"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714640","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:47:11Z","timestamp":1745362031000},"page":"2562-2578","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Adaptive Activation Steering: A Tuning-Free LLM Truthfulness Improvement Method for Diverse Hallucinations Categories"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7292-6868","authenticated-orcid":false,"given":"Tianlong","family":"Wang","sequence":"first","affiliation":[{"name":"School of Software and Microelectronics, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7380-1736","authenticated-orcid":false,"given":"Xianfeng","family":"Jiao","sequence":"additional","affiliation":[{"name":"Key Laboratory of High Confidence Software Technologies, Ministry of Education, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2640-6477","authenticated-orcid":false,"given":"Yinghao","family":"Zhu","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9487-8140","authenticated-orcid":false,"given":"Zhongzhi","family":"Chen","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4674-970X","authenticated-orcid":false,"given":"Yifan","family":"He","sequence":"additional","affiliation":[{"name":"School of Software and Microelectronics, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0520-7196","authenticated-orcid":false,"given":"Xu","family":"Chu","sequence":"additional","affiliation":[{"name":"Center on Frontiers of Computing Studies, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4951-8682","authenticated-orcid":false,"given":"Junyi","family":"Gao","sequence":"additional","affiliation":[{"name":"Centre for Medical Informatics, University of Edinburgh, Edinburgh, Scotland, United Kingdom and Health Data Research UK, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8026-9688","authenticated-orcid":false,"given":"Yasha","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of High Confidence Software Technologies, Ministry of Education, Beijing, China and National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5233-0624","authenticated-orcid":false,"given":"Liantao","family":"Ma","sequence":"additional","affiliation":[{"name":"Key Laboratory of High Confidence Software Technologies, Ministry of Education, Beijing, China and National Engineering Research Center for Software Engineering, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Understanding intermediate layers using linear classifier probes. arXiv preprint arXiv:1610.01644","author":"Alain Guillaume","year":"2016","unstructured":"Guillaume Alain and Yoshua Bengio. 2016. Understanding intermediate layers using linear classifier probes. arXiv preprint arXiv:1610.01644 (2016)."},{"key":"e_1_3_2_1_3_1","unstructured":"Yuntao Bai Andy Jones Kamal Ndousse Amanda Askell Anna Chen Nova DasSarma Dawn Drain Stanislav Fort Deep Ganguli Tom Henighan et al. 2022. Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv preprint arXiv:2204.05862 (2022)."},{"key":"e_1_3_2_1_4_1","volume-title":"Semantic photo manipulation with a generative image prior. arXiv preprint arXiv:2005.07727","author":"Bau David","year":"2020","unstructured":"David Bau, Hendrik Strobelt, William Peebles, Jonas Wulff, Bolei Zhou, Jun-Yan Zhu, and Antonio Torralba. 2020a. Semantic photo manipulation with a generative image prior. arXiv preprint arXiv:2005.07727 (2020)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1907375117"},{"key":"e_1_3_2_1_6_1","volume-title":"Probing classifiers: Promises, shortcomings, and advances. Computational Linguistics","author":"Belinkov Yonatan","year":"2016","unstructured":"Yonatan Belinkov. 2016. Probing classifiers: Promises, shortcomings, and advances. Computational Linguistics (2016), 1--12."},{"key":"e_1_3_2_1_7_1","volume-title":"LEACE: Perfect linear concept erasure in closed form. arXiv preprint arXiv:2306.03819","author":"Belrose Nora","year":"2023","unstructured":"Nora Belrose, David Schneider-Joseph, Shauli Ravfogel, Ryan Cotterell, Edward Raff, and Stella Biderman. 2023. LEACE: Perfect linear concept erasure in closed form. arXiv preprint arXiv:2306.03819 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models.","author":"Brown Davis","year":"2023","unstructured":"Davis Brown, Charles Godfrey, Cody Nizinski, Jonathan Tu, and Henry Kvinge. 2023. Robustness of edited neural networks. In ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models."},{"key":"e_1_3_2_1_9_1","volume-title":"Discovering latent knowledge in language models without supervision. arXiv preprint arXiv:2212.03827","author":"Burns Collin","year":"2022","unstructured":"Collin Burns, Haotian Ye, Dan Klein, and Jacob Steinhardt. 2022. Discovering latent knowledge in language models without supervision. arXiv preprint arXiv:2212.03827 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality.","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality."},{"key":"e_1_3_2_1_11_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Plug and play language models: A simple approach to controlled text generation. arXiv preprint arXiv:1912.02164","author":"Dathathri Sumanth","year":"2019","unstructured":"Sumanth Dathathri, Andrea Madotto, Janice Lan, Jane Hung, Eric Frank, Piero Molino, Jason Yosinski, and Rosanne Liu. 2019. Plug and play language models: A simple approach to controlled text generation. arXiv preprint arXiv:1912.02164 (2019)."},{"key":"e_1_3_2_1_13_1","volume-title":"Chain-of-verification reduces hallucination in large language models. arXiv preprint arXiv:2309.11495","author":"Dhuliawala Shehzaad","year":"2023","unstructured":"Shehzaad Dhuliawala, Mojtaba Komeili, Jing Xu, Roberta Raileanu, Xian Li, Asli Celikyilmaz, and Jason Weston. 2023. Chain-of-verification reduces hallucination in large language models. arXiv preprint arXiv:2309.11495 (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_15_1","unstructured":"Nelson Elhage Tristan Hume Catherine Olsson Nicholas Schiefer Tom Henighan Shauna Kravec Zac Hatfield-Dodds Robert Lasenby Dawn Drain Carol Chen et al. 2022. Toy models of superposition. arXiv preprint arXiv:2209.10652 (2022)."},{"key":"e_1_3_2_1_16_1","unstructured":"N Elhage N Nanda C Olsson T Henighan N Joseph B Mann A Askell Y Bai A Chen T Conerly et al. 2021. A mathematical framework for transformer circuits. Transformer Circuits Thread (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Erasing concepts from diffusion models. arXiv preprint arXiv:2303.07345","author":"Gandikota Rohit","year":"2023","unstructured":"Rohit Gandikota, Joanna Materzynska, Jaden Fiotto-Kaufman, and David Bau. 2023. Erasing concepts from diffusion models. arXiv preprint arXiv:2303.07345 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_19_1","volume-title":"Mitchell Wortsman, Suchin Gururangan, Ludwig Schmidt, Hannaneh Hajishirzi, and Ali Farhadi.","author":"Ilharco Gabriel","year":"2022","unstructured":"Gabriel Ilharco, Marco Tulio Ribeiro, Mitchell Wortsman, Suchin Gururangan, Ludwig Schmidt, Hannaneh Hajishirzi, and Ali Farhadi. 2022. Editing models with task arithmetic. arXiv preprint arXiv:2212.04089 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"Improving activation steering in language models with mean-centring. arXiv preprint arXiv:2312.03813","author":"Jorgensen Ole","year":"2023","unstructured":"Ole Jorgensen, Dylan Cope, Nandi Schoots, and Murray Shanahan. 2023. Improving activation steering in language models with mean-centring. arXiv preprint arXiv:2312.03813 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Personas as a way to model truthfulness in language models. arXiv preprint arXiv:2310.18168","author":"Joshi Nitish","year":"2023","unstructured":"Nitish Joshi, Javier Rando, Abulhair Saparov, Najoung Kim, and He He. 2023. Personas as a way to model truthfulness in language models. arXiv preprint arXiv:2310.18168 (2023)."},{"key":"e_1_3_2_1_22_1","unstructured":"Saurav Kadavath Tom Conerly Amanda Askell Tom Henighan Dawn Drain Ethan Perez Nicholas Schiefer Zac Hatfield-Dodds Nova DasSarma Eli Tran-Johnson et al. 2022. Language models (mostly) know what they know. arXiv preprint arXiv:2207.05221 (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Artificial Intelligence and Statistics. PMLR, 5250--5270","author":"Kleindessner Matth''aus","year":"2023","unstructured":"Matth''aus Kleindessner, Michele Donini, Chris Russell, and Muhammad Bilal Zafar. 2023. Efficient fair PCA for fair representation learning. In International Conference on Artificial Intelligence and Statistics. PMLR, 5250--5270."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3636454"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_2_1_26_1","volume-title":"International conference on machine learning. PMLR, 1558--1566","author":"Lindbo Larsen Anders Boesen","year":"2016","unstructured":"Anders Boesen Lindbo Larsen, S\u00f8ren Kaae S\u00f8nderby, Hugo Larochelle, and Ole Winther. 2016. Autoencoding beyond pixels using a learned similarity metric. In International conference on machine learning. PMLR, 1558--1566."},{"key":"e_1_3_2_1_27_1","volume-title":"Inference-Time Intervention: Eliciting Truthful Answers from a Language Model. arXiv preprint arXiv:2306.03341","author":"Li Kenneth","year":"2023","unstructured":"Kenneth Li, Oam Patel, Fernanda Vi\u00e9gas, Hanspeter Pfister, and Martin Wattenberg. 2023. Inference-Time Intervention: Eliciting Truthful Answers from a Language Model. arXiv preprint arXiv:2306.03341 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Kenneth","year":"2024","unstructured":"Kenneth Li, Oam Patel, Fernanda Vi\u00e9gas, Hanspeter Pfister, and Martin Wattenberg. 2024. Inference-time intervention: Eliciting truthful answers from a language model. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958","author":"Lin Stephanie","year":"2021","unstructured":"Stephanie Lin, Jacob Hilton, and Owain Evans. 2021. Truthfulqa: Measuring how models mimic human falsehoods. arXiv preprint arXiv:2109.07958 (2021)."},{"key":"e_1_3_2_1_30_1","first-page":"16331","article-title":"Editgan: High-precision semantic image editing","volume":"34","author":"Ling Huan","year":"2021","unstructured":"Huan Ling, Karsten Kreis, Daiqing Li, Seung Wook Kim, Antonio Torralba, and Sanja Fidler. 2021. Editgan: High-precision semantic image editing. Advances in Neural Information Processing Systems, Vol. 34 (2021), 16331--16345.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_31_1","volume-title":"Patterns","volume":"4","author":"Ma Liantao","year":"2023","unstructured":"Liantao Ma, Chaohe Zhang, Junyi Gao, Xianfeng Jiao, Zhihao Yu, Yinghao Zhu, Tianlong Wang, Xinyu Ma, Yasha Wang, Wen Tang, et al. 2023. Mortality prediction with adaptive feature importance recalibration for peritoneal dialysis patients. Patterns, Vol. 4, 12 (2023)."},{"key":"e_1_3_2_1_32_1","first-page":"17359","article-title":"Locating and editing factual associations in GPT","volume":"35","author":"Meng Kevin","year":"2022","unstructured":"Kevin Meng, David Bau, Alex Andonian, and Yonatan Belinkov. 2022. Locating and editing factual associations in GPT. Advances in Neural Information Processing Systems, Vol. 35 (2022), 17359--17372.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_33_1","unstructured":"Jacob Menick Maja Trebacz Vladimir Mikulik John Aslanides Francis Song Martin Chadwick Mia Glaese Susannah Young Lucy Campbell-Gillingham Geoffrey Irving et al. 2022. Teaching language models to support answers with verified quotes. arXiv preprint arXiv:2203.11147 (2022)."},{"key":"e_1_3_2_1_34_1","volume-title":"Dean Carignan, and Eric Horvitz.","author":"Nori Harsha","year":"2023","unstructured":"Harsha Nori, Nicholas King, Scott Mayer McKinney, Dean Carignan, and Eric Horvitz. 2023. Capabilities of gpt-4 on medical challenge problems. arXiv preprint arXiv:2303.13375 (2023)."},{"key":"e_1_3_2_1_35_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_36_1","volume-title":"Sequence level training with recurrent neural networks. arXiv preprint arXiv:1511.06732","author":"Ranzato Marc'Aurelio","year":"2015","unstructured":"Marc'Aurelio Ranzato, Sumit Chopra, Michael Auli, and Wojciech Zaremba. 2015. Sequence level training with recurrent neural networks. arXiv preprint arXiv:1511.06732 (2015)."},{"key":"e_1_3_2_1_37_1","volume-title":"Kernelized Concept Erasure. arXiv preprint arXiv:2201.12191","author":"Ravfogel Shauli","year":"2022","unstructured":"Shauli Ravfogel, Francisco Vargas, Yoav Goldberg, and Ryan Cotterell. 2022. Kernelized Concept Erasure. arXiv preprint arXiv:2201.12191 (2022)."},{"key":"e_1_3_2_1_38_1","volume-title":"BLEURT: Learning robust metrics for text generation. arXiv preprint arXiv:2004.04696","author":"Sellam Thibault","year":"2020","unstructured":"Thibault Sellam, Dipanjan Das, and Ankur P Parikh. 2020. BLEURT: Learning robust metrics for text generation. arXiv preprint arXiv:2004.04696 (2020)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00926"},{"key":"e_1_3_2_1_40_1","volume-title":"Extracting latent steering vectors from pretrained language models. arXiv preprint arXiv:2205.05124","author":"Subramani Nishant","year":"2022","unstructured":"Nishant Subramani, Nivedita Suresh, and Matthew E Peters. 2022. Extracting latent steering vectors from pretrained language models. arXiv preprint arXiv:2205.05124 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Alpaca: A Strong, Replicable Instruction-Following Model","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B Hashimoto. 2023. Alpaca: A Strong, Replicable Instruction-Following Model. Stanford Center for Research on Foundation Models. https:\/\/crfm. stanford. edu\/2023\/03\/13\/alpaca. html (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"BERT rediscovers the classical NLP pipeline. arXiv preprint arXiv:1905.05950","author":"Tenney Ian","year":"2019","unstructured":"Ian Tenney, Dipanjan Das, and Ellie Pavlick. 2019. BERT rediscovers the classical NLP pipeline. arXiv preprint arXiv:1905.05950 (2019)."},{"key":"e_1_3_2_1_43_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023a. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023b. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Activation addition: Steering language models without optimization. arXiv preprint arXiv:2308.10248","author":"Turner Alex","year":"2023","unstructured":"Alex Turner, Lisa Thiergart, David Udell, Gavin Leech, Ulisse Mini, and Monte MacDiarmid. 2023. Activation addition: Steering language models without optimization. arXiv preprint arXiv:2308.10248 (2023)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.645"},{"key":"e_1_3_2_1_47_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645471"},{"key":"e_1_3_2_1_49_1","volume-title":"Self-Instruct: Aligning Language Model with Self Generated Instructions. arXiv preprint arXiv:2212.10560","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A Smith, Daniel Khashabi, and Hannaneh Hajishirzi. 2022. Self-Instruct: Aligning Language Model with Self Generated Instructions. arXiv preprint arXiv:2212.10560 (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824--24837."},{"key":"e_1_3_2_1_51_1","volume-title":"Sampling generative networks. arXiv preprint arXiv:1609.04468","author":"White Tom","year":"2016","unstructured":"Tom White. 2016. Sampling generative networks. arXiv preprint arXiv:1609.04468 (2016)."},{"key":"e_1_3_2_1_52_1","volume-title":"A critical evaluation of evaluations for long-form question answering. arXiv preprint arXiv:2305.18201","author":"Xu Fangyuan","year":"2023","unstructured":"Fangyuan Xu, Yixiao Song, Mohit Iyyer, and Eunsol Choi. 2023. A critical evaluation of evaluations for long-form question answering. arXiv preprint arXiv:2305.18201 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Truthx: Alleviating hallucinations by editing large language models in truthful space. arXiv preprint arXiv:2402.17811","author":"Zhang Shaolei","year":"2024","unstructured":"Shaolei Zhang, Tian Yu, and Yang Feng. 2024. Truthx: Alleviating hallucinations by editing large language models in truthful space. arXiv preprint arXiv:2402.17811 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Fine-tuning language models from human preferences. arXiv preprint arXiv:1909.08593","author":"Ziegler Daniel M","year":"2019","unstructured":"Daniel M Ziegler, Nisan Stiennon, Jeffrey Wu, Tom B Brown, Alec Radford, Dario Amodei, Paul Christiano, and Geoffrey Irving. 2019. Fine-tuning language models from human preferences. arXiv preprint arXiv:1909.08593 (2019)."},{"key":"e_1_3_2_1_55_1","unstructured":"Andy Zou Long Phan Sarah Chen James Campbell Phillip Guo Richard Ren Alexander Pan Xuwang Yin Mantas Mazeika Ann-Kathrin Dombrowski et al. 2023. Representation engineering: A top-down approach to ai transparency. arXiv preprint arXiv:2310.01405 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714640","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714640","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:56Z","timestamp":1750295936000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714640"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":55,"alternative-id":["10.1145\/3696410.3714640","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714640","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}