{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T23:46:28Z","timestamp":1769730388477,"version":"3.49.0"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031824807","type":"print"},{"value":"9783031824814","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-82481-4_7","type":"book-chapter","created":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T15:44:34Z","timestamp":1741016674000},"page":"92-105","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Refined Direct Preference Optimization with\u00a0Synthetic Data for\u00a0Behavioral Alignment of\u00a0LLMs"],"prefix":"10.1007","author":[{"given":"V\u00edctor","family":"Gallego","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,3,4]]},"reference":[{"key":"7_CR1","unstructured":"Askell, A., et al.: A general language assistant as a laboratory for alignment. arXiv preprint arXiv:2112.00861 (2021)"},{"key":"7_CR2","unstructured":"Bai, Y., et\u00a0al.: Constitutional AI: harmlessness from ai feedback. 
arXiv preprint arXiv:2212.08073 (2022)"},{"key":"7_CR3","unstructured":"Bowman, S.R.: Eight things to know about large language models. arXiv preprint arXiv:2304.00612 (2023)"},{"key":"7_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR5","unstructured":"Chen, W., Li, B.: GRATH: gradual self-truthifying for large language models. arXiv preprint arXiv:2401.12292 (2024)"},{"key":"7_CR6","unstructured":"Chiang, W.L., et al.: Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"7_CR7","unstructured":"Christiano, P.F., Leike, J., Brown, T., Martic, M., Legg, S., Amodei, D.: Deep reinforcement learning from human preferences. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"7_CR8","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"7_CR9","unstructured":"Deng, Y., Zhang, W., Chen, Z., Gu, Q.: Rephrase and respond: let large language models ask better questions for themselves. arXiv preprint arXiv:2311.04205 (2023)"},{"key":"7_CR10","unstructured":"Dong, H., et al.: RAFT: Reward rAnked FineTuning for generative foundation model alignment. arXiv preprint arXiv:2304.06767 (2023)"},{"key":"7_CR11","unstructured":"Durbin, J.: Truthy DPO dataset (2024). https:\/\/huggingface.co\/datasets\/jondurbin\/truthy-dpo-v0.1"},{"key":"7_CR12","unstructured":"Gallego, V.: ZYN: zero-shot reward models with yes-no questions. arXiv preprint arXiv:2308.06385 (2023)"},{"key":"7_CR13","unstructured":"Gallego, V.: Distilled self-critique of LLMs with synthetic data: a Bayesian perspective. In: The Second Tiny Papers Track at ICLR 2024 (2024). 
https:\/\/openreview.net\/forum?id=AfVtVrCH9U"},{"key":"7_CR14","unstructured":"Glaese, A., et\u00a0al.: Improving alignment of dialogue agents via targeted human judgements. arXiv preprint arXiv:2209.14375 (2022)"},{"key":"7_CR15","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. Adv. Neural Inf. Process. Syst. 27 (2014)"},{"key":"7_CR16","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"7_CR17","unstructured":"Hu, E.J., et al.: LORA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"7_CR18","unstructured":"Jiang, A.Q., et\u00a0al.: Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)"},{"key":"7_CR19","doi-asserted-by":"publisher","unstructured":"Josifoski, M., Sakota, M., Peyrard, M., West, R.: Exploiting asymmetry for synthetic training data generation: SynthIE and the case of information extraction. In: Bouamor, H., Pino, J., Bali, K. (eds.) Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 1555\u20131574. Association for Computational Linguistics, Singapore (2023). https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.96","DOI":"10.18653\/v1\/2023.emnlp-main.96"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: SOLAR 10.7b: scaling large language models with simple yet effective depth up-scaling (2023)","DOI":"10.18653\/v1\/2024.naacl-industry.3"},{"key":"7_CR21","unstructured":"Lee, H., et al.: RLAIF: scaling reinforcement learning from human feedback with AI feedback. arXiv preprint arXiv:2309.00267 (2023)"},{"key":"7_CR22","unstructured":"Li, Y., Bubeck, S., Eldan, R., Del\u00a0Giorno, A., Gunasekar, S., Lee, Y.T.: Textbooks are all you need II: phi-1.5 technical report. arXiv preprint arXiv:2309.05463 (2023)"},{"key":"7_CR23","unstructured":"Liu, B., et al.: TinyGSM: achieving >80% on GSM8k with small language models. 
arXiv preprint arXiv:2312.09241 (2023)"},{"key":"7_CR24","unstructured":"Liu, T., et al.: Statistical rejection sampling improves preference optimization. arXiv preprint arXiv:2309.06657 (2023)"},{"key":"7_CR25","unstructured":"Luo, H., et al.: WizardMath: empowering mathematical reasoning for large language models via reinforced Evol-instruct. arXiv preprint arXiv:2308.09583 (2023)"},{"key":"7_CR26","unstructured":"Madaan, A., et\u00a0al.: Self-refine: iterative refinement with self-feedback. arXiv preprint arXiv:2303.17651 (2023)"},{"key":"7_CR27","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"7_CR28","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback (2022)"},{"key":"7_CR29","unstructured":"Perez, E., et\u00a0al.: Discovering language model behaviors with model-written evaluations. arXiv preprint arXiv:2212.09251 (2022)"},{"key":"7_CR30","unstructured":"Prasad, A., Stengel-Eskin, E., Bansal, M.: Rephrase, augment, reason: visual grounding of questions for vision-language models. arXiv preprint arXiv:2310.05861 (2023)"},{"key":"7_CR31","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019). https:\/\/api.semanticscholar.org\/CorpusID:160025533"},{"key":"7_CR32","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., Finn, C.: Direct preference optimization: your language model is secretly a reward model. arXiv preprint arXiv:2305.18290 (2023)"},{"key":"7_CR33","unstructured":"Roziere, B., et\u00a0al.: Code Llama: open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)"},{"key":"7_CR34","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. 
arXiv preprint arXiv:1707.06347 (2017)"},{"key":"7_CR35","doi-asserted-by":"crossref","unstructured":"Shen, X., Chen, Z., Backes, M., Shen, Y., Zhang, Y.: Do anything now: characterizing and evaluating in-the-wild jailbreak prompts on large language models. arXiv preprint arXiv:2308.03825 (2023)","DOI":"10.1145\/3658644.3670388"},{"key":"7_CR36","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"7_CR37","unstructured":"teknium: OpenHermes-2.5-Mistral-7b (2023). https:\/\/huggingface.co\/teknium\/OpenHermes-2.5-Mistral-7B"},{"key":"7_CR38","unstructured":"Tunstall, L., et\u00a0al.: Zephyr: direct distillation of LM alignment. arXiv preprint arXiv:2310.16944 (2023)"},{"key":"7_CR39","unstructured":"Turner, R.E.: An introduction to transformers. arXiv preprint arXiv:2304.10557 (2023)"},{"key":"7_CR40","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"7_CR41","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR42","unstructured":"Weston, J., Sukhbaatar, S.: System 2 attention (is something you might need too). arXiv preprint arXiv:2311.11829 (2023)"},{"key":"7_CR43","unstructured":"Yang, Y., et al.: Decoding data quality via synthetic corruptions: embedding-guided pruning of code data. arXiv preprint arXiv:2312.02418 (2023)"},{"key":"7_CR44","unstructured":"Yu, L., et al.: MetaMath: Bootstrap your own mathematical questions for large language models. arXiv preprint arXiv:2309.12284 (2023)"},{"key":"7_CR45","unstructured":"Yuan, W., Pang, R.Y., Cho, K., Sukhbaatar, S., Xu, J., Weston, J.: Self-rewarding language models. 
arXiv preprint arXiv:2401.10020 (2024)"},{"key":"7_CR46","unstructured":"Yuan, Z., Yuan, H., Li, C., Dong, G., Tan, C., Zhou, C.: Scaling relationship on learning mathematical reasoning with large language models. arXiv preprint arXiv:2308.01825 (2023)"},{"key":"7_CR47","unstructured":"Yuan, Z., Yuan, H., Tan, C., Wang, W., Huang, S., Huang, F.: RRHF: rank responses to align language models with human feedback without tears. arXiv preprint arXiv:2304.05302 (2023)"},{"key":"7_CR48","unstructured":"Zhang, S., et\u00a0al.: Instruction tuning for large language models: a survey. arXiv preprint arXiv:2308.10792 (2023)"},{"key":"7_CR49","unstructured":"Ziegler, D.M., et al.: Fine-tuning language models from human preferences. arXiv preprint arXiv:1909.08593 (2019)"},{"key":"7_CR50","unstructured":"Zou, A., Wang, Z., Kolter, J.Z., Fredrikson, M.: Universal and transferable adversarial attacks on aligned language models (2023)"}],"container-title":["Lecture Notes in Computer Science","Machine Learning, Optimization, and Data Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-82481-4_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T15:44:43Z","timestamp":1741016683000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-82481-4_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031824807","9783031824814"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-82481-4_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"4 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter 
History"}},{"value":"The authors have no competing interests to declare that\u00a0are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"LOD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Machine Learning, Optimization, and Data Science","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Castiglione della Pescaia","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mod2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/lod2024.icas.events\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}