{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,2]],"date-time":"2025-08-02T17:06:51Z","timestamp":1754154411946,"version":"3.41.2"},"publisher-location":"Singapore","reference-count":42,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819500130"},{"type":"electronic","value":"9789819500147"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-95-0014-7_20","type":"book-chapter","created":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T10:06:19Z","timestamp":1753351579000},"page":"233-244","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Introspective Reward Modeling via Inverse Reinforcement Learning for LLM Alignment"],"prefix":"10.1007","author":[{"given":"Zhiqiang","family":"Wang","sequence":"first","affiliation":[]},{"given":"Ruoxi","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Shaowei","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Yizhong","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,25]]},"reference":[{"issue":"1","key":"20_CR1","first-page":"15","volume":"36","author":"L Aroyo","year":"2015","unstructured":"Aroyo, L., Welty, C.: Truth is a lie: crowd truth and the seven myths of human annotation. AI Mag. 36(1), 15\u201324 (2015)","journal-title":"AI Mag."},{"key":"20_CR2","unstructured":"Cao, S., Cheng, R., Wang, Z.: Agr: Age group fairness reward for bias mitigation in llms (2024). arXiv preprintarXiv:2409.04340"},{"key":"20_CR3","unstructured":"Casper, S., et al.: Open problems and fundamental limitations of reinforcement learning from human feedback (2023). arXiv preprint arXiv:2307.15217"},{"key":"20_CR4","unstructured":"Chao, P., Robey, A., Dobriban, E., Hassani, H., Pappas, G.J., Wong, E.: Jailbreaking black box large language modelsin twenty queries (2023). arXiv preprint arXiv:2310.08419"},{"key":"20_CR5","unstructured":"Cheng, R., Ding, Y., Cao, S., Yuan, S., Wang, Z., Jia, X.: Bamba: A bimodal adversarial multi-round black-box jailbreak attacker for lvlms (2024). arXiv preprint arXiv:2412.05892"},{"key":"20_CR6","unstructured":"Cheng, R., et al.: Reinforcement learning from multi-roledebates as feedback for bias mitigation in llms (2024). arXiv preprint arXiv:2404.10160"},{"key":"20_CR7","unstructured":"Dubois, Y., Liang, P., Hashimoto, T.: Length-controlled alpacaeval: A simple debiasing of automatic evaluators. In: First Conference on Language Modeling (2024)"},{"key":"20_CR8","unstructured":"Grattafiori, A., et al.: The llama 3 herd of models (2024). arXiv preprint arXiv:2407.21783"},{"key":"20_CR9","unstructured":"Hendrycks, D., et al.: Measuring mathematical problem solving with the math dataset (2021). 
arXiv preprint arXiv:2103.03874"},{"key":"20_CR10","unstructured":"Huang, Y., Gupta, S., Xia, M., Li, K., Chen, D.: Catastrophic jailbreak of open-source llms via exploiting generation (2023). arXiv preprint arXiv:2310.06987"},{"key":"20_CR11","unstructured":"Khera, A., Ghosh, R., Dutta, D.: Efficient alignment of large language models via data sampling (2024). arXiv preprint arXiv:2411.10545"},{"key":"20_CR12","first-page":"124292","volume":"37","author":"J Li","year":"2024","unstructured":"Li, J., Zeng, S., Wai, H.T., Li, C., Garcia, A., Hong, M.: Getting more juice out of the SFT data: Reward learning from human demonstration improves SFT for LLM alignment. Adv. Neural. Inf. Process. Syst. 37, 124292\u2013124318 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR13","unstructured":"Lin, Y., et al.: Mitigating the alignment tax of rlhf (2023). arXiv preprint arXiv:2309.06256"},{"issue":"2","key":"20_CR14","doi-asserted-by":"publisher","first-page":"181","DOI":"10.1038\/s42256-025-00985-0","volume":"7","author":"S Liu","year":"2025","unstructured":"Liu, S., et al.: Rethinking machine unlearning for large language models. Nat. Mach. Intell. 7(2), 181\u2013194 (2025). https:\/\/doi.org\/10.1038\/s42256-025-00985-0","journal-title":"Nat. Mach. Intell."},{"key":"20_CR15","unstructured":"Liu, Y., et al.: Trustworthy llms: a survey and guideline for evaluating large language models\u2019 alignment (2023). arXiv preprint arXiv:2308.05374"},{"key":"20_CR16","unstructured":"Ng, A.Y., Russell, S., et al.: Algorithms for inverse reinforcement learning. In: Icml. vol. 1, p. 2 (2000)"},{"key":"20_CR17","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR18","unstructured":"Qi, X., et al.: Safety alignment should be made more than just a few tokens deep (2024). arXiv preprint arXiv:2406.05946"},{"key":"20_CR19","unstructured":"Qi, X., Zeng, Y., Xie, T., Chen, P.Y., Jia, R., Mittal, P., Henderson, P.: Fine-tuning aligned language models compromises safety, even when users do not intend to! (2023). arXiv preprint arXiv:2310.03693"},{"key":"20_CR20","first-page":"53728","volume":"36","author":"R Rafailov","year":"2023","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Manning, C.D., Ermon, S., Finn, C.: Direct preference optimization: Your language model is secretly a reward model. Adv. Neural. Inf. Process. Syst. 36, 53728\u201353741 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"R\u00f6ttger, P., Kirk, H.R., Vidgen, B., Attanasio, G., Bianchi, F., Hovy, D.: Xstest: A test suite for identifying exaggerated safety behaviours in large language models (2023). arXiv preprint arXiv:2308.01263","DOI":"10.18653\/v1\/2024.naacl-long.301"},{"key":"20_CR22","unstructured":"Shao, Z., et al.: Deepseekmath: Pushing the limits of mathematical reasoning in open language models (2024). arXiv preprint arXiv:2402.03300"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Shokri, R., Stronati, M., Song, C., Shmatikov, V.: Membership inference attacks against machine learning models. In: 2017 IEEE symposium on security and privacy (SP), pp. 3\u201318. 
IEEE (2017)","DOI":"10.1109\/SP.2017.41"},{"key":"20_CR24","unstructured":"Souly, A., et al.: A strongreject for empty jailbreaks (2024). arXiv preprint arXiv:2402.10260"},{"key":"20_CR25","unstructured":"Sun, H., Shen, Y., Ton, J.F.: Rethinking reward modeling in preference-based large language model alignment. In: The Thirteenth International Conference on Learning Representations"},{"key":"20_CR26","unstructured":"Sun, H., Zhang, Z., Deng, J., Cheng, J., Huang, M.: Safety assessment of Chinese large language models (2023). arXiv preprint arXiv:2304.10436"},{"key":"20_CR27","first-page":"104471","volume":"37","author":"A Wachi","year":"2024","unstructured":"Wachi, A., Tran, T., Sato, R., Tanabe, T., Akimoto, Y.: Stepwise alignment for constrained language model policy optimization. Adv. Neural. Inf. Process. Syst. 37, 104471\u2013104520 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR28","unstructured":"Wang, B., et al..: Adversarial glue: A multi-task benchmark for robustness evaluation of language models (2021). arXiv preprint arXiv:2111.02840"},{"key":"20_CR29","unstructured":"Wang, F., et al.: Mrj-agent: An effective jailbreak agent for multi-round dialogue (2024). arXiv preprint arXiv:2411.03814"},{"key":"20_CR30","unstructured":"Wang, Y., Li, H., Han, X., Nakov, P., Baldwin, T.: Do-not-answer: a dataset for evaluating safeguards in llms (2023). arXiv preprint arXiv:2308.13387"},{"key":"20_CR31","doi-asserted-by":"crossref","unstructured":"Wang, Z., Yang, F., Wang, L., Zhao, P., Wang, H., Chen, L., Lin, Q., Wong, K.F.: Self-guard: Empower the llm to safeguard itself. arXiv preprint arXiv:2310.15851 (2023)","DOI":"10.18653\/v1\/2024.naacl-long.92"},{"key":"20_CR32","unstructured":"Wang, Z., et al.: A comprehensive survey of llm alignment techniques: Rlhf, rlaif, ppo, dpo and more (2024). arXiv preprint arXiv:2407.16216"},{"key":"20_CR33","unstructured":"Wei, J., et al.: Measuring short-form factuality in large language models (2024). arXiv preprint arXiv:2411.04368"},{"key":"20_CR34","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR35","unstructured":"Wu, T., et al.: Meta-rewarding language models: Self-improving alignment with LLM-as-a-meta-judge (2024). arXiv preprint arXiv:2407.19594"},{"key":"20_CR36","unstructured":"Xie, T., et al.: SORRY-bench: Systematically evaluating large language model safety refusal. In: The Thirteenth International Conference on Learning Representations (2025). https:\/\/openreview.net\/forum?id=YfKNaRktan"},{"key":"20_CR37","unstructured":"Yang, A., et al.: Qwen2. 5 technical report (2024). arXiv preprint arXiv:2412.15115"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Lin, H., Zhang, J., Yang, D., Jia, R., Shi, W.: How johnny can persuade llms to jailbreak them: Rethinking persuasion to challenge AI safety by humanizing LLMS. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 14322\u201314350 (2024)","DOI":"10.18653\/v1\/2024.acl-long.773"},{"key":"20_CR39","unstructured":"Zhang, Y., et al..: Stair: Improving safety alignment with introspective reasoning (2025). 
arXiv preprint arXiv:2502.02384"},{"key":"20_CR40","unstructured":"Zhao, W., Ren, X., Hessel, J., Cardie, C., Choi, Y., Deng, Y.: Wildchat: 1m chatgpt interaction logs in the wild (2024). arXiv preprint arXiv:2405.01470"},{"key":"20_CR41","unstructured":"Zou, A., et al.: Improving alignment and robustness with circuit breakers. In: The Thirty-eighth Annual Conference on Neural Information Processing Systems (2024)"},{"key":"20_CR42","unstructured":"Zou, A., Wang, Z., Carlini, N., Nasr, M., Kolter, J.Z., Fredrikson, M.: Universal and transferable adversarial attacks on aligned language models (2023). arXiv preprint arXiv:2307.15043"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-0014-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T10:06:29Z","timestamp":1753351589000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-0014-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819500130","9789819500147"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-0014-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"25 July 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ningbo","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 July 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 July 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/icg\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
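
The record above matches the response shape of the public Crossref REST API works route. A minimal Python sketch, assuming that the https://api.crossref.org/works/{DOI} endpoint is reachable and returns the same field names shown above (message, title, author, DOI, page, references-count), for reading out the chapter's key metadata:

import json
import urllib.request

# Assumed lookup of the work record shown above via the Crossref REST API.
DOI = "10.1007/978-981-95-0014-7_20"
url = "https://api.crossref.org/works/" + DOI

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

msg = record["message"]
print(msg["title"][0])                                   # chapter title
print(", ".join(a["given"] + " " + a["family"] for a in msg["author"]))  # author list
print(msg["DOI"], msg["page"], msg["references-count"])  # DOI, page range, 42 references

The same fields can of course be read directly from the JSON above without any network call; the request is shown only to indicate where a record of this shape typically comes from.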