{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,18]],"date-time":"2025-09-18T10:39:06Z","timestamp":1758191946827,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["61977019, U1813206"],"award-info":[{"award-number":["61977019, U1813206"]}]},{"name":"Shenzhen Fundamental Research Program","award":["JCYJ20180507183837726, JCYJ20220818102415033, JSGG20201103093802006"],"award-info":[{"award-number":["JCYJ20180507183837726, JCYJ20220818102415033, JSGG20201103093802006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,18]]},"DOI":"10.1145\/3719545.3721107","type":"proceedings-article","created":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T09:38:41Z","timestamp":1758015521000},"page":"31-38","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Logarithmic Function Matters Policy Gradient Deep Reinforcement Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7520-306X","authenticated-orcid":false,"given":"Qi","family":"Liu","sequence":"first","affiliation":[{"name":"Guangdong Key Laboratory of Intelligent Morphing Mechanisms and Adaptive Robotics, School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2314-6911","authenticated-orcid":false,"given":"Jingxiang","family":"Guo","sequence":"additional","affiliation":[{"name":"Guangdong Key Laboratory of Intelligent Morphing Mechanisms and Adaptive Robotics, School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4605-4468","authenticated-orcid":false,"given":"Zhongjian","family":"Qiao","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4307-5618","authenticated-orcid":false,"given":"Pengbin","family":"Chen","sequence":"additional","affiliation":[{"name":"Guangdong Key Laboratory of Intelligent Morphing Mechanisms and Adaptive Robotics, School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7636-4586","authenticated-orcid":false,"given":"Jinxuan","family":"Zhu","sequence":"additional","affiliation":[{"name":"Guangdong Key Laboratory of Intelligent Morphing Mechanisms and Adaptive Robotics, School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7890-9677","authenticated-orcid":false,"given":"Yanjie","family":"Li","sequence":"additional","affiliation":[{"name":"Guangdong Key Laboratory of Intelligent Morphing Mechanisms and Adaptive Robotics, School of Intelligence Science and Engineering, Harbin Institute of Technology, Shenzhen, Shenzhen, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,9,16]]},"reference":[{"doi-asserted-by":"publisher","unstructured":"Andrew\u00a0G. Barto Richard\u00a0S. Sutton and Charles\u00a0W. Anderson. 1983. Neuronlike adaptive elements that can solve difficult learning control problems. IEEE Transactions on Systems Man and Cybernetics SMC-13 5 (1983) 834\u2013846. 10.1109\/TSMC.1983.6313077","key":"e_1_3_3_1_2_2","DOI":"10.1109\/TSMC.1983.6313077"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_3_2","DOI":"10.1007\/978-3-642-35289-826"},{"unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014).","key":"e_1_3_3_1_4_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_5_2","DOI":"10.1007\/978-3-030-94662-3_14"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_6_2","DOI":"10.2991\/978-94-6463-370-2_10"},{"unstructured":"Qi Liu Jingxiang Guo Sixu Lin Shuaikang Ma Jinxuan Zhu and Yanjie Li. 2024. MASQ: Multi-Agent Reinforcement Learning for Single Quadruped Robot Locomotion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.13759 (2024).","key":"e_1_3_3_1_7_2"},{"doi-asserted-by":"crossref","unstructured":"Qi Liu Yanjie Li Shiyu Chen Ke Lin Xiongtao Shi and Yunjiang Lou. 2023. Distributional reinforcement learning with epistemic and aleatoric uncertainty estimation. Information Sciences 644 (2023) 119217.","key":"e_1_3_3_1_8_2","DOI":"10.1016\/j.ins.2023.119217"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_9_2","DOI":"10.1109\/CASE48305.2020.9217023"},{"doi-asserted-by":"crossref","unstructured":"Qi Liu Yanjie Li Yuecheng Liu Ke Lin Jianqi Gao and Yunjiang Lou. 2024. Data Efficient Deep Reinforcement Learning With Action-Ranked Temporal Difference Learning. IEEE Transactions on Emerging Topics in Computational Intelligence 8 4 (2024) 2949\u20132961.","key":"e_1_3_3_1_10_2","DOI":"10.1109\/TETCI.2024.3369641"},{"unstructured":"Qi Liu Yanjie Li Xiongtao Shi Ke Lin Yuecheng Liu and Yunjiang Lou. 2024. Distributional Policy Gradient With Distributional Value Function. IEEE Transactions on Neural Networks and Learning Systems (2024) 1\u201313.","key":"e_1_3_3_1_11_2"},{"unstructured":"Qi Liu Xiaopeng Zhang Mingshan Tan Shuaikang Ma Jinliang Ding and Yanjie Li. 2025. MASH: Cooperative-Heterogeneous Multi-Agent Reinforcement Learning for Single Humanoid Robot Locomotion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.10423 (2025).","key":"e_1_3_3_1_12_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_13_2","DOI":"10.1007\/978-3-030-94662-3_7"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_14_2","DOI":"10.1007\/978-3-030-94662-3_3"},{"key":"e_1_3_3_1_15_2","first-page":"1928","volume-title":"International Conference on Machine Learning","author":"Mnih Volodymyr","year":"2016","unstructured":"Volodymyr Mnih, Adria\u00a0Puigdomenech Badia, Mehdi Mirza, Alex Graves, Timothy Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu. 2016. Asynchronous methods for deep reinforcement learning. In International Conference on Machine Learning. PMLR, 1928\u20131937."},{"doi-asserted-by":"crossref","unstructured":"Volodymyr Mnih Koray Kavukcuoglu David Silver Andrei\u00a0A Rusu Joel Veness Marc\u00a0G Bellemare Alex Graves Martin Riedmiller Andreas\u00a0K Fidjeland Georg Ostrovski et\u00a0al. 2015. Human-level control through deep reinforcement learning. Nature 518 7540 (2015) 529\u2013533.","key":"e_1_3_3_1_16_2","DOI":"10.1038\/nature14236"},{"doi-asserted-by":"crossref","unstructured":"Jan Peters and Stefan Schaal. 2008. Natural actor-critic. Neurocomputing 71 7-9 (2008) 1180\u20131190.","key":"e_1_3_3_1_17_2","DOI":"10.1016\/j.neucom.2007.11.026"},{"key":"e_1_3_3_1_18_2","first-page":"4295","volume-title":"International Conference on Machine Learning","author":"Rashid Tabish","year":"2018","unstructured":"Tabish Rashid, Mikayel Samvelyan, Christian Schroeder, Gregory Farquhar, Jakob Foerster, and Shimon Whiteson. 2018. QMIX: Monotonic value function factorisation for deep multi-agent reinforcement learning. In International Conference on Machine Learning. PMLR, 4295\u20134304."},{"key":"e_1_3_3_1_19_2","first-page":"1889","volume-title":"International Conference on Machine Learning","author":"Schulman John","year":"2015","unstructured":"John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz. 2015. Trust region policy optimization. In International Conference on Machine Learning. PMLR, 1889\u20131897."},{"key":"e_1_3_3_1_20_2","volume-title":"International Conference on Learning Representations","author":"Schulman John","year":"2016","unstructured":"John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. 2016. High-dimensional continuous control using generalized advantage estimation. In International Conference on Learning Representations."},{"unstructured":"John Schulman Filip Wolski Prafulla Dhariwal Alec Radford and Oleg Klimov. 2017. Proximal policy optimization algorithms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1707.06347 (2017).","key":"e_1_3_3_1_21_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_22_2","DOI":"10.2991\/978-94-6463-370-2_21"},{"key":"e_1_3_3_1_23_2","volume-title":"Reinforcement learning: An introduction","author":"Sutton Richard\u00a0S","year":"2018","unstructured":"Richard\u00a0S Sutton and Andrew\u00a0G Barto. 2018. Reinforcement learning: An introduction. MIT press."},{"key":"e_1_3_3_1_24_2","volume-title":"Advances in Neural Information Processing Systems","author":"Sutton Richard\u00a0S","year":"1999","unstructured":"Richard\u00a0S Sutton, David McAllester, Satinder Singh, and Yishay Mansour. 1999. Policy gradient methods for reinforcement learning with function approximation. In Advances in Neural Information Processing Systems, Vol.\u00a012."},{"unstructured":"T. Tieleman. 2012. Lecture 6.5\u2010rmsprop: Divide the Gradient by a Running Average of Its Recent Magnitude. 26\u00a0pages. https:\/\/cir.nii.ac.jp\/crid\/1370017282431050757","key":"e_1_3_3_1_25_2"},{"unstructured":"Masatoshi Uehara Chengchun Shi and Nathan Kallus. 2022. A Review of Off-Policy Evaluation in Reinforcement Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.06355 (2022).","key":"e_1_3_3_1_26_2"},{"key":"e_1_3_3_1_27_2","first-page":"3667","volume-title":"Advances in Neural Information Processing Systems","author":"Vlassis Nikos","year":"2021","unstructured":"Nikos Vlassis, Ashok Chandrashekar, Fernando Amat, and Nathan Kallus. 2021. Control variates for slate off-policy evaluation. In Advances in Neural Information Processing Systems, Vol.\u00a034. 3667\u20133679."},{"unstructured":"Rachel Ward Xiaoxia Wu and Leon Bottou. 2020. Adagrad stepsizes: Sharp convergence over nonconvex landscapes. Journal of Machine Learning Research 21 219 (2020) 1\u201330.","key":"e_1_3_3_1_28_2"},{"doi-asserted-by":"crossref","unstructured":"Ronald\u00a0J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine Learning 8 3 (1992) 229\u2013256.","key":"e_1_3_3_1_29_2","DOI":"10.1023\/A:1022672621406"},{"doi-asserted-by":"publisher","key":"e_1_3_3_1_30_2","DOI":"10.1007\/978-3-030-94662-3_4"}],"event":{"acronym":"DAI '24","name":"DAI '24: 6th International Conference on Distributed Artificial Intelligences","location":"Singapore Singapore"},"container-title":["Proceedings of the 2024 Sixth International Conference on Distributed Artificial Intelligences"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3719545.3721107","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T13:13:24Z","timestamp":1758114804000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3719545.3721107"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,18]]},"references-count":29,"alternative-id":["10.1145\/3719545.3721107","10.1145\/3719545"],"URL":"https:\/\/doi.org\/10.1145\/3719545.3721107","relation":{},"subject":[],"published":{"date-parts":[[2024,12,18]]},"assertion":[{"value":"2025-09-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}