{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T22:14:26Z","timestamp":1730240066425,"version":"3.28.0"},"reference-count":15,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,25]],"date-time":"2024-07-25T00:00:00Z","timestamp":1721865600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,25]],"date-time":"2024-07-25T00:00:00Z","timestamp":1721865600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,25]]},"DOI":"10.1109\/icecet61485.2024.10698396","type":"proceedings-article","created":{"date-parts":[[2024,10,8]],"date-time":"2024-10-08T17:33:15Z","timestamp":1728408795000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Large Language Model Performance with Reinforcement Learning from Human Feedback: A Comprehensive Study on Q&A, Summarization, and Classification"],"prefix":"10.1109","author":[{"given":"Nirdosh","family":"Rawal","sequence":"first","affiliation":[{"name":"Genpact India Private Limited,Data Science & Insights,Bengaluru,India"}]},{"given":"Prudhvith","family":"Tavva","sequence":"additional","affiliation":[{"name":"Genpact India Private Limited,Data Science & Insights,Bengaluru,India"}]},{"given":"Prakash","family":"Selvakumar","sequence":"additional","affiliation":[{"name":"Genpact India Private Limited,Data Science & Insights,Bengaluru,India"}]}],"member":"263","reference":[{"key":"ref1","first-page":"253","article-title":"Learning to summarize from human feedback","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems (NIPS20)","author":"Stiennon"},{"key":"ref2","article-title":"Training a helpful and harmless assistant with reinforcement learning from human feedback","author":"Bai","year":"2022","journal-title":"arXiv preprint"},{"key":"ref3","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Jiang","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref4","article-title":"Safe rlhf: Safe reinforcement learning from human feedback","author":"Sun","year":"2023","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Proximal policy optimization algorithms","author":"Dhariwal","year":"2017","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Revisiting design choices in proximal policy optimization","author":"Mendler-D\u00fcnner","year":"2020","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CAC48633.2019.8996875"},{"issue":"7","key":"ref8","first-page":"4600","article-title":"Proximal policy optimization with policy feedback","volume":"52","author":"Chen","year":"2021","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics: Systems"},{"key":"ref9","article-title":"Fine-tuning language models from human preferences","author":"Wu","year":"2019","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"ref11","article-title":"Rrhf: Rank responses to align language models with human feedback without tears","author":"Tan","year":"2023","journal-title":"arXiv preprint"},{"key":"ref12","first-page":"17506","article-title":"Pretraining language models with human preferences","volume-title":"International Conference on Machine Learning","author":"Chen"},{"key":"ref13","article-title":"Constitutional ai: Harmlessness from ai feedback","author":"Kundu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"Reward learning from human preferences and demonstrations in atari","volume":"31","author":"Pohlen","year":"2018","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Direct preference-based policy optimization without reward modeling","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Zuo","key":"ref15"}],"event":{"name":"2024 International Conference on Electrical, Computer and Energy Technologies (ICECET)","start":{"date-parts":[[2024,7,25]]},"location":"Sydney, Australia","end":{"date-parts":[[2024,7,27]]}},"container-title":["2024 International Conference on Electrical, Computer and Energy Technologies (ICECET)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10697983\/10697986\/10698396.pdf?arnumber=10698396","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,9]],"date-time":"2024-10-09T06:25:16Z","timestamp":1728455116000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10698396\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,25]]},"references-count":15,"URL":"https:\/\/doi.org\/10.1109\/icecet61485.2024.10698396","relation":{},"subject":[],"published":{"date-parts":[[2024,7,25]]}}}