{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T13:37:16Z","timestamp":1742996236961,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":55,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819787012"},{"type":"electronic","value":"9789819787029"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8702-9_33","type":"book-chapter","created":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T14:47:03Z","timestamp":1738939623000},"page":"493-504","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Examining Policy Entropy of\u00a0Reinforcement Learning Agents for\u00a0Personalization Tasks"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7095-8236","authenticated-orcid":false,"given":"Anton","family":"Dereventsov","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5426-325X","authenticated-orcid":false,"given":"Andrew","family":"Starnes","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1375-0359","authenticated-orcid":false,"given":"Clayton","family":"Webster","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,8]]},"reference":[{"key":"33_CR1","unstructured":"Abel, D., MacGlashan, J., Littman, M.L.: Reinforcement learning as a framework for ethical decision making. In: Workshops at the thirtieth AAAI Conference on Artificial Intelligence (2016)"},{"issue":"1","key":"33_CR2","first-page":"4431","volume":"22","author":"A Agarwal","year":"2021","unstructured":"Agarwal, A., Kakade, S.M., Lee, J.D., Mahajan, G.: On the theory of policy gradient methods: optimality, approximation, and distribution shift. J. Mach. Learn. Res. 22(1), 4431\u20134506 (2021)","journal-title":"J. Mach. Learn. Res."},{"key":"33_CR3","unstructured":"Ahmed, Z., Le\u00a0Roux, N., Norouzi, M., Schuurmans, D.: Understanding the impact of entropy on policy optimization. In: International Conference on Machine Learning, pp. 151\u2013160. PMLR (2019)"},{"key":"33_CR4","doi-asserted-by":"publisher","unstructured":"Amatriain, X., Basilico, J.: Recommender systems in industry: a netflix case study, pp. 385\u2013419. Springer US, Boston, MA (2015). https:\/\/doi.org\/10.1007\/978-1-4899-7637-6_11","DOI":"10.1007\/978-1-4899-7637-6_11"},{"key":"33_CR5","unstructured":"Aspinall, M., Hamermesh, R.: Realizing the promise of personalized medicine. Harvard Business Rev. 85, 108\u201317, 165 (2007)"},{"issue":"3","key":"33_CR6","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1609\/aimag.v32i3.2361","volume":"32","author":"R Burke","year":"2011","unstructured":"Burke, R., Felfernig, A., G\u00f6ker, M.H.: Recommender systems: an overview. AI Mag. 32(3), 13\u201318 (2011). https:\/\/doi.org\/10.1609\/aimag.v32i3.2361","journal-title":"AI Mag."},{"key":"33_CR7","unstructured":"Chen, M., Gummadi, R., Harris, C., Schuurmans, D.: Surrogate objectives for batch policy optimization in one-step decision making. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"33_CR8","doi-asserted-by":"crossref","unstructured":"Dereventsov, A., Vatsavai, R., Webster, C.G.: On the unreasonable efficiency of state space clustering in personalization tasks. In: 2021 International Conference on Data Mining Workshops (ICDMW), pp. 742\u2013749. IEEE (2021)","DOI":"10.1109\/ICDMW53433.2021.00097"},{"issue":"8","key":"33_CR9","first-page":"1178","volume":"21","author":"Z Dou","year":"2008","unstructured":"Dou, Z., Song, R., Wen, J.R., Yuan, X.: Evaluating the effectiveness of personalized web search. IEEE Trans. Knowl. Data Eng. 21(8), 1178\u20131190 (2008)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"33_CR10","unstructured":"Dud\u00edk, M., Langford, J., Li, L.: Doubly robust policy evaluation and learning. In: Proceedings of the 28th International Conference on International Conference on Machine Learning, pp. 1097\u20131104 (2011)"},{"key":"33_CR11","doi-asserted-by":"publisher","unstructured":"Ferretti, S., Mirri, S., Prandi, C., Salomoni, P.: Automatic web content personalization through reinforcement learning. J. Syst. Softw. 121, 157\u2013169 (2016). https:\/\/doi.org\/10.1016\/j.jss.2016.02.008, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0164121216000443","DOI":"10.1016\/j.jss.2016.02.008"},{"key":"33_CR12","unstructured":"Garg, S., Tosatto, S., Pan, Y., White, M., Mahmood, A.R.: An alternate policy gradient estimator for softmax policies. arXiv preprint arXiv:2112.11622 (2021)"},{"issue":"12","key":"33_CR13","doi-asserted-by":"publisher","first-page":"491","DOI":"10.1016\/S0167-7799(01)01814-5","volume":"19","author":"GS Ginsburg","year":"2001","unstructured":"Ginsburg, G.S., McCarthy, J.J.: Personalized medicine: revolutionizing drug discovery and patient care. Trends Biotechnol. 19(12), 491\u2013496 (2001)","journal-title":"Trends Biotechnol."},{"key":"33_CR14","doi-asserted-by":"publisher","unstructured":"Gomez-Uribe, C.A., Hunt, N.: The netflix recommender system: Algorithms, business value, and innovation. ACM Trans. Manage. Inf. Syst. (TMIS) 6(4) (2016). https:\/\/doi.org\/10.1145\/2843948","DOI":"10.1145\/2843948"},{"key":"33_CR15","doi-asserted-by":"crossref","unstructured":"Grondman, I., Busoniu, L., Lopes, G.A., Babuska, R.: A survey of actor-critic reinforcement learning: standard and natural policy gradients. IEEE Trans. Syst. Man Cybern. Part C (Appl. Rev.) 42(6), 1291\u20131307 (2012)","DOI":"10.1109\/TSMCC.2012.2218595"},{"key":"33_CR16","doi-asserted-by":"crossref","unstructured":"Harrison, R.M., Dereventsov, A., Bibin, A.: Zero-shot recommendations with pre-trained large language models for multimodal nudging. In: 2023 IEEE International Conference on Data Mining Workshops (ICDMW), pp. 1535\u20131542. IEEE (2023)","DOI":"10.1109\/ICDMW60847.2023.00195"},{"key":"33_CR17","doi-asserted-by":"crossref","unstructured":"Hassouni, A.e., Hoogendoorn, M., Otterlo, M.v., Barbaro, E.: Personalization of health interventions using cluster-based reinforcement learning. In: International Conference on Principles and Practice of Multi-Agent Systems, pp. 467\u2013475. Springer (2018)","DOI":"10.1007\/978-3-030-03098-8_31"},{"key":"33_CR18","doi-asserted-by":"publisher","unstructured":"den Hengst, F., Grua, E., el\u00a0Hassouni, A., Hoogendoorn, M.: Reinforcement learning for personalization: a systematic literature review. Data Sci. 3, 1\u201341 (2020). https:\/\/doi.org\/10.3233\/DS-200028","DOI":"10.3233\/DS-200028"},{"key":"33_CR19","unstructured":"Ie, E., et al.: Recsim: a configurable simulation platform for recommender systems. arXiv preprint arXiv:1909.04847 (2019)"},{"key":"33_CR20","doi-asserted-by":"publisher","unstructured":"Jacobson, K., Murali, V., Newett, E., Whitman, B., Yon, R.: Music personalization at spotify. In: Proceedings of the 10th ACM Conference on Recommender Systems, RecSys 2016, p.\u00a0373. Association for Computing Machinery, New York, NY, USA (2016). https:\/\/doi.org\/10.1145\/2959100.2959120","DOI":"10.1145\/2959100.2959120"},{"key":"33_CR21","doi-asserted-by":"publisher","first-page":"133653","DOI":"10.1109\/ACCESS.2019.2941229","volume":"7","author":"B Jang","year":"2019","unstructured":"Jang, B., Kim, M., Harerimana, G., Kim, J.W.: Q-learning algorithms: a comprehensive classification and applications. IEEE Access 7, 133653\u2013133667 (2019)","journal-title":"IEEE Access"},{"key":"33_CR22","unstructured":"Langford, J., Zhang, T.: The epoch-greedy algorithm for multi-armed bandits with side information. In: Advances in Neural Information Processing Systems, vol. 20 (2007)"},{"key":"33_CR23","unstructured":"Lasalvia, L.: Personalization and standardization: Can we have it all? J. Precis. Med. 6(1) (2020)"},{"key":"33_CR24","unstructured":"Lei, H., Tewari, A., Murphy, S.A.: An actor-critic contextual bandit algorithm for personalized mobile health interventions. arXiv preprint arXiv:1706.09090 (2017)"},{"key":"33_CR25","doi-asserted-by":"crossref","unstructured":"Li, L., Chu, W., Langford, J., Schapire, R.E.: A contextual-bandit approach to personalized news article recommendation. In: Proceedings of the 19th International Conference on World Wide Web, pp. 661\u2013670 (2010)","DOI":"10.1145\/1772690.1772758"},{"issue":"1","key":"33_CR26","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1049\/cje.2019.10.004","volume":"29","author":"S Li","year":"2020","unstructured":"Li, S., Yan, Y., Ren, J., Zhou, Y., Zhang, Y.: A sample-efficient actor-critic algorithm for recommendation diversification. Chin. J. Electron. 29(1), 89\u201396 (2020)","journal-title":"Chin. J. Electron."},{"key":"33_CR27","first-page":"21130","volume":"33","author":"J Mei","year":"2020","unstructured":"Mei, J., Xiao, C., Dai, B., Li, L., Szepesv\u00e1ri, C., Schuurmans, D.: Escaping the gravitational pull of softmax. Adv. Neural. Inf. Process. Syst. 33, 21130\u201321140 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"33_CR28","unstructured":"Mnih, V., et al.: Asynchronous methods for deep reinforcement learning. In: International Conference on Machine Learning, pp. 1928\u20131937. PMLR (2016)"},{"key":"33_CR29","unstructured":"Mnih, V., et al.: Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602 (2013)"},{"key":"33_CR30","unstructured":"Nachum, O., Norouzi, M., Xu, K., Schuurmans, D.: Bridging the gap between value and policy based reinforcement learning. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"33_CR31","doi-asserted-by":"crossref","unstructured":"Pan, F., Cai, Q., Tang, P., Zhuang, F., He, Q.: Policy gradients for contextual recommendations. In: The World Wide Web Conference, pp. 1421\u20131431 (2019)","DOI":"10.1145\/3308558.3313616"},{"key":"33_CR32","unstructured":"Raffin, A., Hill, A., Gleave, A., Kanervisto, A., Ernestus, M., Dormann, N.: Stable-baselines3: reliable reinforcement learning implementations. J. Mach. Learn. Res. (2021)"},{"key":"33_CR33","doi-asserted-by":"crossref","unstructured":"Ricci, F., Rokach, L., Shapira, B.: Introduction to recommender systems handbook. In: Recommender Systems Handbook (2011)","DOI":"10.1007\/978-0-387-85820-3"},{"key":"33_CR34","doi-asserted-by":"publisher","unstructured":"Ricci, F., Rokach, L., Shapira, B.: Recommender systems: introduction and challenges, pp. 1\u201334. Springer US, Boston, MA (2015). https:\/\/doi.org\/10.1007\/978-1-4899-7637-6_1","DOI":"10.1007\/978-1-4899-7637-6_1"},{"key":"33_CR35","unstructured":"Rohde, D., Bonner, S., Dunlop, T., Vasile, F., Karatzoglou, A.: Recogym: a reinforcement learning environment for the problem of product recommendation in online advertising. arXiv preprint arXiv:1808.00720 (2018)"},{"key":"33_CR36","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"33_CR37","doi-asserted-by":"publisher","unstructured":"Smith, B., Linden, G.: Two decades of recommender systems at amazon.com. IEEE Internet Comput. 21(3), 12\u201318 (2017). https:\/\/doi.org\/10.1109\/MIC.2017.72","DOI":"10.1109\/MIC.2017.72"},{"key":"33_CR38","doi-asserted-by":"crossref","unstructured":"Srivihok, A., Sukonmanee, P.: E-commerce intelligent agent: personalization travel support agent using q learning. In: Proceedings of the 7th International Conference on Electronic Commerce, pp. 287\u2013292 (2005)","DOI":"10.1145\/1089551.1089606"},{"key":"33_CR39","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: An Introduction. MIT Press (2018)"},{"key":"33_CR40","unstructured":"Sutton, R.S., Bowling, M.H., Pilarski, P.M.: The alberta plan for AI research. arXiv preprint arXiv:2208.11173 (2022)"},{"key":"33_CR41","unstructured":"Sutton, R.S., McAllester, D., Singh, S., Mansour, Y.: Policy gradient methods for reinforcement learning with function approximation. In: Advances in Neural Information Processing Systems, vol. 12 (1999)"},{"key":"33_CR42","doi-asserted-by":"crossref","unstructured":"Swaminathan, A., Joachims, T.: Counterfactual risk minimization: learning from logged bandit feedback. In: International Conference on Machine Learning, pp. 814\u2013823. PMLR (2015)","DOI":"10.1145\/2740908.2742564"},{"issue":"4","key":"33_CR43","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1177\/0146621619858674","volume":"44","author":"C Tan","year":"2020","unstructured":"Tan, C., Han, R., Ye, R., Chen, K.: Adaptive learning recommendation strategy based on deep q-learning. Appl. Psychol. Meas. 44(4), 251\u2013266 (2020)","journal-title":"Appl. Psychol. Meas."},{"key":"33_CR44","doi-asserted-by":"crossref","unstructured":"Tang, L., Jiang, Y., Li, L., Zeng, C., Li, T.: Personalized recommendation via parameter-free contextual bandits. In: Proceedings of the 38th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 323\u2013332 (2015)","DOI":"10.1145\/2766462.2767707"},{"key":"33_CR45","unstructured":"Thomas, P.S., Okal, B.: A notation for markov decision processes. arXiv preprint arXiv:1512.09075 (2015)"},{"issue":"1","key":"33_CR46","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1007\/s10844-020-00633-6","volume":"57","author":"TNT Tran","year":"2021","unstructured":"Tran, T.N.T., Felfernig, A., Trattner, C., Holzinger, A.: Recommender systems in the healthcare domain: state-of-the-art and research issues. J. Intell. Inf. Syst. 57(1), 171\u2013201 (2021). https:\/\/doi.org\/10.1007\/s10844-020-00633-6","journal-title":"J. Intell. Inf. Syst."},{"key":"33_CR47","doi-asserted-by":"publisher","unstructured":"Vatian, A., et al.: Design patterns for personalization of healthcare process. In: Proceedings of the 2019 2nd International Conference on Geoinformatics and Data Analysis, pp. 83\u201388. ICGDA 2019, Association for Computing Machinery, New York, NY, USA (2019). https:\/\/doi.org\/10.1145\/3318236.3318249","DOI":"10.1145\/3318236.3318249"},{"key":"33_CR48","unstructured":"Wang, L., Cai, Q., Yang, Z., Wang, Z.: Neural policy gradient methods: global optimality and rates of convergence. arXiv preprint arXiv:1909.01150 (2019)"},{"key":"33_CR49","doi-asserted-by":"crossref","unstructured":"Wang, P., Rowe, J.P., Min, W., Mott, B.W., Lester, J.C.: Interactive narrative personalization with deep reinforcement learning. In: IJCAI, pp. 3852\u20133858 (2017)","DOI":"10.24963\/ijcai.2017\/538"},{"key":"33_CR50","doi-asserted-by":"publisher","unstructured":"Wang, X., Wang, Y., Hsu, D., Wang, Y.: Exploration in interactive personalized music recommendation: a reinforcement learning approach. ACM Trans. Multimedia Comput. Commun. Appl. 11(1) (2014). https:\/\/doi.org\/10.1145\/2623372","DOI":"10.1145\/2623372"},{"key":"33_CR51","doi-asserted-by":"publisher","first-page":"1003","DOI":"10.1613\/jair.1.12360","volume":"70","author":"J Whittlestone","year":"2021","unstructured":"Whittlestone, J., Arulkumaran, K., Crosby, M.: The societal implications of deep reinforcement learning. J. Artif. Intell. Res. 70, 1003\u20131030 (2021)","journal-title":"J. Artif. Intell. Res."},{"key":"33_CR52","doi-asserted-by":"crossref","unstructured":"Xin, X., Karatzoglou, A., Arapakis, I., Jose, J.: Supervised advantage actorcritic for recommender systems. In: ACM International WSDM Conference, vol. 15 (2022)","DOI":"10.1145\/3488560.3498494"},{"key":"33_CR53","unstructured":"Yang, W., Li, X., Zhang, Z.: A regularized approach to sparse optimal policy in reinforcement learning. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"33_CR54","first-page":"20913","volume":"33","author":"T Zahavy","year":"2020","unstructured":"Zahavy, T., et al.: A self-tuning actor-critic algorithm. Adv. Neural. Inf. Process. Syst. 33, 20913\u201320924 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"33_CR55","doi-asserted-by":"crossref","unstructured":"Zhu, F., Guo, J., Li, R., Huang, J.: Robust actor-critic contextual bandit for mobile health (mhealth) interventions. In: Proceedings of the 2018 ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics, pp. 492\u2013501 (2018)","DOI":"10.1145\/3233547.3233554"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8702-9_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T14:47:34Z","timestamp":1738939654000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8702-9_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819787012","9789819787029"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8702-9_33","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"8 February 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ICPRAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition and Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Jeju Island","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 June 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 June 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icprai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/brain.korea.ac.kr\/icprai2024\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}