{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T21:11:01Z","timestamp":1743023461152,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":28,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819783663"},{"type":"electronic","value":"9789819783670"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8367-0_33","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T11:56:50Z","timestamp":1732795010000},"page":"555-570","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Prior Constraints-Based Reward Model Training for\u00a0Aligning Large Language Models"],"prefix":"10.1007","author":[{"given":"Hang","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Chenglong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yimin","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Tong","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Chunliang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jingbo","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"33_CR1","doi-asserted-by":"crossref","unstructured":"Amini, A., Vieira, T., Cotterell, R.: Direct preference optimization with an offset. arXiv preprint arXiv:2402.10571 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.592"},{"key":"33_CR2","doi-asserted-by":"crossref","unstructured":"Bradley, R.A., Terry, M.E.: Rank analysis of incomplete block designs: I. Method Paired Comparisons. Biometrika 39(3\/4), 324\u2013345 (1952)","DOI":"10.1093\/biomet\/39.3-4.324"},{"key":"33_CR3","unstructured":"Cheng, P., Xie, J., Bai, K., Dai, Y., Du, N.: Everyone deserves a reward: Learning customized human preferences. arXiv preprint arXiv:2309.03126 (2023)"},{"key":"33_CR4","unstructured":"Coste, T., Anwar, U., Kirk, R., Krueger, D.: Reward model ensembles help mitigate overoptimization. arXiv preprint arXiv:2310.02743 (2023)"},{"key":"33_CR5","unstructured":"Cui, G., et al.: Ultrafeedback: Boosting language models with high-quality feedback. arXiv preprint arXiv:2310.01377 (2023)"},{"key":"33_CR6","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"33_CR7","unstructured":"Dubois, Y., et al.: Alpacafarm: a simulation framework for methods that learn from human feedback. Adv. Neural Inf. Proce. Syst. 36 (2024)"},{"key":"33_CR8","unstructured":"Ji, J., et\u00a0al.: Ai alignment: A comprehensive survey. arXiv preprint arXiv:2310.19852 (2023)"},{"key":"33_CR9","unstructured":"Lee, H., et al.: Rlaif: Scaling reinforcement learning from human feedback with ai feedback. arXiv preprint arXiv:2309.00267 (2023)"},{"key":"33_CR10","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"33_CR11","doi-asserted-by":"crossref","unstructured":"Luce, R.D.: Individual choice behavior: a theoretical analysis. Courier Corporation (2005)","DOI":"10.1037\/14396-000"},{"key":"33_CR12","unstructured":"Min, D.J., Perez-Rosas, V., Resnicow, K., Mihalcea, R.: Dynamic reward adjustment in multi-reward reinforcement learning for counselor reflection generation. arXiv preprint arXiv:2403.13578 (2024)"},{"key":"33_CR13","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)"},{"issue":"2","key":"33_CR14","first-page":"193","volume":"24","author":"RL Plackett","year":"1975","unstructured":"Plackett, R.L.: The analysis of permutations. J. R. Stat. Soc.: Ser. C: Appl. Stat. 24(2), 193\u2013202 (1975)","journal-title":"J. R. Stat. Soc.: Ser. C: Appl. Stat."},{"key":"33_CR15","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Manning, C.D., Ermon, S., Finn, C.: Direct preference optimization: your language model is secretly a reward model. Adv. Neural Inf. Proce. Syst. 36 (2024)"},{"key":"33_CR16","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"33_CR17","unstructured":"t al Stiennon, N., et al.: Learning to summarize with human feedback. Adv. Neural. Inf. Process. Syst. 33, 3008\u20133021 (2020)"},{"key":"33_CR18","unstructured":"Taori, R., et al.: Alpaca: a strong, replicable instruction-following model. Stanford Center Res. Found. Models. 3(6), 7 (2023). https:\/\/crfm.stanford.edu\/2023\/03\/13\/alpaca.html"},{"key":"33_CR19","unstructured":"Touvron, H., et\u00a0al.: Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"33_CR20","unstructured":"Wang, C., et al.: Learning evaluation models from large language models for sequence generation. arXiv preprint arXiv:2308.04386 (2023)"},{"key":"33_CR21","unstructured":"Wang, C., et al.: Esrl: Efficient sampling-based reinforcement learning for sequence generation. arXiv preprint arXiv:2308.02223 (2023)"},{"key":"33_CR22","unstructured":"Wang, Y., et\u00a0al.: Pandalm: An automatic evaluation benchmark for llm instruction tuning optimization. arXiv preprint arXiv:2306.05087 (2023)"},{"key":"33_CR23","unstructured":"Wu, Z., et al.: Fine-grained human feedback gives better rewards for language model training. Adv. Neural Inf. Proce. Syst. 36 (2024)"},{"key":"33_CR24","unstructured":"Xiao, T., Zhu, J.: Introduction to transformers: an nlp perspective. arXiv preprint arXiv:2311.17633 (2023)"},{"key":"33_CR25","first-page":"27263","volume":"34","author":"W Yuan","year":"2021","unstructured":"Yuan, W., Neubig, G., Liu, P.: Bartscore: evaluating generated text as text generation. Adv. Neural. Inf. Process. Syst. 34, 27263\u201327277 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"33_CR26","unstructured":"Yuan, Z., Yuan, H., Tan, C., Wang, W., Huang, S., Huang, F.: Rrhf: Rank responses to align language models with human feedback without tears. arXiv preprint arXiv:2304.05302 (2023)"},{"key":"33_CR27","unstructured":"Zhong, Y., et al.: Panacea: Pareto alignment via preference adaptation for llms. arXiv preprint arXiv:2402.02030 (2024)"},{"key":"33_CR28","unstructured":"Zhu, B., Jiao, J., Jordan, M.I.: Principled reinforcement learning with human feedback from pairwise or $$ k $$-wise comparisons. arXiv preprint arXiv:2301.11270 (2023)"}],"container-title":["Lecture Notes in Computer Science","Chinese Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8367-0_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T12:09:37Z","timestamp":1732795777000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8367-0_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9789819783663","9789819783670"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8367-0_33","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China National Conference on Chinese Computational Linguistics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiyuan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cncl2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/cips-cl.org\/static\/CCL2024\/en\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}