{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T22:06:50Z","timestamp":1779574010974,"version":"3.53.1"},"reference-count":52,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100007162","name":"Guangdong Provincial Department of Science and Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007162","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100019024","name":"Guangdong Polytechnic Normal University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100019024","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004735","name":"Hunan Provincial Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109110","type":"journal-article","created":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T16:10:57Z","timestamp":1778861457000},"page":"109110","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Collective reflection-based multi-agent reinforcement learning framework for task-oriented dialogue policy learning"],"prefix":"10.1016","volume":"203","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5116-3739","authenticated-orcid":false,"given":"Kai","family":"Xu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhenyu","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1890-6508","authenticated-orcid":false,"given":"Yangyang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bopeng","family":"Fang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109110_bib0001","series-title":"Proceedings of the 36th international conference on neural information processing systems","first-page":"28955","article-title":"Reincarnating reinforcement learning: Reusing prior computation to accelerate progress","author":"Agarwal","year":"2022"},{"key":"10.1016\/j.neunet.2026.109110_bib0002","series-title":"Robots and biological systems: Towards a new bionics?","first-page":"703","article-title":"Swarm intelligence in cellular robotic systems","author":"Beni","year":"1993"},{"key":"10.1016\/j.neunet.2026.109110_bib0003","series-title":"Convex functions: Dual description","first-page":"151","author":"Brinkhuis","year":"2020"},{"key":"10.1016\/j.neunet.2026.109110_bib0004","series-title":"Proceedings of the 2018 conference on empirical methods in natural language processing","first-page":"5016","article-title":"Microsoft dialogue challenge: Building end-to-end task-completion dialogue systems","author":"Budzianowski","year":"2018"},{"issue":"8","key":"10.1016\/j.neunet.2026.109110_bib0005","doi-asserted-by":"crossref","first-page":"5337","DOI":"10.1109\/TWC.2022.3233436","article-title":"Federated multi-agent deep reinforcement learning (fed-MADRL) for dynamic spectrum access","volume":"22","author":"Chang","year":"2023","journal-title":"IEEE Transactions on Wireless Communications"},{"key":"10.1016\/j.neunet.2026.109110_bib0006","series-title":"34th conference on neural information processing systems","first-page":"5527\u2014-5540","article-title":"Independent policy gradient methods for competitive reinforcement learning","volume":"vol. 33","author":"Daskalakis","year":"2020"},{"key":"10.1016\/j.neunet.2026.109110_bib0007","series-title":"The thirty-eighth AAAI conference on artificial intelligence","first-page":"17362","article-title":"Situation-dependent causal influence-based cooperative multi-agent reinforcement learning","volume":"Vol. 38","author":"Du","year":"2024"},{"key":"10.1016\/j.neunet.2026.109110_bib0008","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2024.120483","article-title":"MOFS-REPLS: A large-scale multi-objective feature selection algorithm based on real-valued encoding and preference leadership strategy","volume":"667","author":"Fu","year":"2024","journal-title":"Information Sciences"},{"key":"10.1016\/j.neunet.2026.109110_bib0009","series-title":"Proceedings of the 37th AAAI conference on artificial intelligence, AAAI 2023","first-page":"7613","article-title":"Fast counterfactual inference for history-based reinforcement learning","volume":"Vol. 37","author":"Gao","year":"2023"},{"key":"10.1016\/j.neunet.2026.109110_bib0010","series-title":"Proceedings of the annual meeting of the association for computational linguistics","first-page":"2320","article-title":"Beyond the granularity: Multi-perspective dialogue collaborative selection for dialogue state tracking","author":"Guo","year":"2022"},{"key":"10.1016\/j.neunet.2026.109110_bib0011","series-title":"Collaborative computing: Networking, applications and worksharing","first-page":"396","article-title":"Learning dialogue policy efficiently through dyna proximal policy optimization","author":"Huang","year":"2023"},{"issue":"11","key":"10.1016\/j.neunet.2026.109110_bib0012","doi-asserted-by":"crossref","first-page":"7006","DOI":"10.1109\/TCOMM.2024.3409530","article-title":"A collaborative multi-agent deep reinforcement learning-based wireless power allocation with centralized training and decentralized execution","volume":"72","author":"Kopic","year":"2024","journal-title":"IEEE Transactions on Communications"},{"key":"10.1016\/j.neunet.2026.109110_bib0013","doi-asserted-by":"crossref","first-page":"318","DOI":"10.1007\/s11633-022-1347-y","article-title":"A survey on recent advances and challenges in reinforcement learning methods for task-oriented dialogue policy learning","volume":"20","author":"Kwan","year":"2023","journal-title":"Machine Intelligence Research"},{"key":"10.1016\/j.neunet.2026.109110_bib0014","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.inffus.2022.03.003","article-title":"Exploration in deep reinforcement learning: A survey","volume":"85","author":"Ladosz","year":"2022","journal-title":"Information Fusion"},{"key":"10.1016\/j.neunet.2026.109110_bib0015","series-title":"Proceedings of the 8th international conference on learning representations","article-title":"Maxmin Q-learning: Controlling the estimation bias of Q-learning","author":"Lan","year":"2020"},{"key":"10.1016\/j.neunet.2026.109110_bib0016","unstructured":"Lee, K., Hwang, D., Park, S., Jang, Y., & Lee, M. (2024). Reinforcement learning from reflective feedback (RLRF): Aligning and improving LLMs via fine-grained self-reflection. arXiv: 2403.14238."},{"key":"10.1016\/j.neunet.2026.109110_bib0017","first-page":"1","article-title":"Eliminating primacy bias in online reinforcement learning by self-distillation","volume":"early access","author":"Li","year":"2024","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.neunet.2026.109110_bib0018","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2023.110558","article-title":"Multi-objective binary grey wolf optimization for feature selection based on guided mutation strategy","volume":"145","author":"Li","year":"2023","journal-title":"Applied Soft Computing"},{"key":"10.1016\/j.neunet.2026.109110_bib0019","unstructured":"Li, X., Lipton, Z. C., & Dhingra, B. (2017). A user simulator for task-completion dialogues. arXiv: 1612.05688v3."},{"key":"10.1016\/j.neunet.2026.109110_bib0020","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106101","article-title":"Coordination as inference in multi-agent reinforcement learning","volume":"172","author":"Li","year":"2024","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109110_bib0021","series-title":"Proceedings of the 56th annual meeting of the association for computational linguistics, melbourne, australia","first-page":"201","article-title":"Task-oriented dialogue system for automatic diagnosis","author":"Liu","year":"2018"},{"key":"10.1016\/j.neunet.2026.109110_bib0022","series-title":"Thirty-seventh AAAI conference on artificial intelligence","first-page":"11595","article-title":"Contrastive identity-aware learning for multi-agent value decomposition","volume":"vol. 37","author":"Liu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109110_bib0023","series-title":"Proceedings of the 2021 conference on empirical methods in natural language processing","first-page":"4335","article-title":"DuRecDial 2.0: A bilingual parallel corpus for conversational recommendation","author":"Liu","year":"2021"},{"key":"10.1016\/j.neunet.2026.109110_bib0024","doi-asserted-by":"crossref","first-page":"1129","DOI":"10.1109\/TMM.2023.3276505","article-title":"Knowledge-enhanced causal reinforcement learning model for interactive recommendation","volume":"26","author":"Nie","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109110_bib0025","series-title":"Proceedings of the 56th annual meeting of the association for computational linguistics","first-page":"2182","article-title":"Deep dyna-Q : Integrating planning for task-completion dialogue policy learning","author":"Peng","year":"2018"},{"key":"10.1016\/j.neunet.2026.109110_bib0026","series-title":"Recursive introspection: Teaching language model agents how to self-improve","author":"Qu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109110_bib0027","series-title":"Proceedings of the 60th annual meeting of the association for computational linguistics","first-page":"92","article-title":"Causal-aware safe policy improvement for task-oriented dialogue","author":"Ramachandran","year":"2022"},{"issue":"1","key":"10.1016\/j.neunet.2026.109110_bib0028","doi-asserted-by":"crossref","first-page":"246","DOI":"10.1007\/s12559-020-09769-7","article-title":"Towards sentiment-aware multi-modal dialogue policy learning","volume":"14","author":"Saha","year":"2020","journal-title":"Cognitive Computation"},{"key":"10.1016\/j.neunet.2026.109110_bib0029","series-title":"Proceedings of the international joint conference on neural networks","article-title":"Transfer learning based task-oriented dialogue policy for multiple domains using hierarchical reinforcement learning","author":"Saha","year":"2020"},{"key":"10.1016\/j.neunet.2026.109110_bib0030","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017). Proximal policy optimization algorithms. arXiv: 1707.06347."},{"key":"10.1016\/j.neunet.2026.109110_bib0031","series-title":"Proceedings of the 40th international conference on machine learning","article-title":"TGRL: An algorithm for teacher guided reinforcement learning","author":"Shenfeld","year":"2023"},{"key":"10.1016\/j.neunet.2026.109110_bib0032","unstructured":"Shih, F., & Liang, F. (2024). Fast value tracking for deep reinforcement learning. arXiv: 2403.13178."},{"key":"10.1016\/j.neunet.2026.109110_bib0033","unstructured":"Su, K., Zhou, S., Jiang, J., Gan, C., Wang, X., & Lu, Z. (2023). MA2QL: A minimalist approach to fully decentralized multi-agent reinforcement learning. arXiv: 2209.08244."},{"key":"10.1016\/j.neunet.2026.109110_bib0034","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"625","article-title":"Multi-agent task-oriented dialog policy learning with role-aware reward decomposition","author":"Takanobu","year":"2020"},{"key":"10.1016\/j.neunet.2026.109110_bib0035","series-title":"Conference on empirical methods in natural language processing and 9th international joint conference on natural language processing, proceedings of the conference","first-page":"100","article-title":"Guided dialog policy learning: Reward estimation for multi-domain task-oriented dialog","author":"Takanobu","year":"2019"},{"issue":"5","key":"10.1016\/j.neunet.2026.109110_bib0036","doi-asserted-by":"crossref","first-page":"4295","DOI":"10.1007\/s10462-022-10281-7","article-title":"Swarm intelligence algorithms for multiple unmanned aerial vehicles collaboration: A comprehensive review","volume":"56","author":"Tang","year":"2022","journal-title":"Artificial Intelligence Review"},{"key":"10.1016\/j.neunet.2026.109110_bib0037","series-title":"Proceedings of the 7th international conference on learning representations, ICLR","article-title":"Reward constrained policy optimization","author":"Tessler","year":"2019"},{"key":"10.1016\/j.neunet.2026.109110_bib0038","series-title":"Findings of the association for computational linguistics: NAACL","first-page":"565","article-title":"Anti-overestimation dialogue policy learning for task-completion dialogue system","author":"Tian","year":"2022"},{"issue":"3","key":"10.1016\/j.neunet.2026.109110_bib0039","doi-asserted-by":"crossref","first-page":"407","DOI":"10.1007\/s11747-010-0211-8","article-title":"Drivers of sales performance: A contemporary meta-analysis. have salespeople become knowledge brokers?","volume":"38","author":"Verbeke","year":"2010","journal-title":"Journal of the Academy of Marketing Science"},{"key":"10.1016\/j.neunet.2026.109110_bib0040","series-title":"Proceedings of the 2021 conference on empirical methods in natural language processing","first-page":"7882","article-title":"A collaborative multi-agent reinforcement learning framework for dialog action decomposition","author":"Wang","year":"2021"},{"issue":"8","key":"10.1016\/j.neunet.2026.109110_bib0041","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TKDE.2023.3289949","article-title":"Acquiring new knowledge without losing old ones for effective continual dialogue policy learning","volume":"14","author":"Wang","year":"2023","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"issue":"8","key":"10.1016\/j.neunet.2026.109110_bib0042","doi-asserted-by":"crossref","first-page":"10475","DOI":"10.1109\/TNNLS.2023.3242071","article-title":"A target-driven planning approach for goal-directed dialog systems","volume":"35","author":"Wang","year":"2024","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.neunet.2026.109110_bib0043","series-title":"Conference of the North American chapter of the association for computational linguistics: Human language technologies","first-page":"3319","article-title":"LUNA: Learning slot-turn alignment for dialogue state tracking","author":"Wang","year":"2022"},{"key":"10.1016\/j.neunet.2026.109110_bib0044","series-title":"Proceedings of the 30th ACM international conference on information & knowledge management","first-page":"4243","article-title":"Explore, filter and distill: Distilled reinforcement learning in recommendation","author":"Xie","year":"2021"},{"key":"10.1016\/j.neunet.2026.109110_bib0045","series-title":"Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation","first-page":"4555","article-title":"Deep reinforcement learning-based dialogue policy with graph convolutional Q-network","author":"Xu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109110_bib0046","series-title":"Proceedings of the 31st international conference on computational linguistics","first-page":"7331","article-title":"An efficient dialogue policy agent with model-based causal reinforcement learning","author":"Xu","year":"2025"},{"issue":"February","key":"10.1016\/j.neunet.2026.109110_bib0047","doi-asserted-by":"crossref","first-page":"243","DOI":"10.1016\/j.inffus.2023.02.009","article-title":"Causal inference multi-agent reinforcement learning for traffic signal control","volume":"94","author":"Yang","year":"2023","journal-title":"Information Fusion"},{"key":"10.1016\/j.neunet.2026.109110_bib0048","series-title":"Proceedings of the international joint conference on autonomous agents and multiagent systems","doi-asserted-by":"crossref","first-page":"2083","DOI":"10.65109\/QBLC7883","article-title":"Integrating independent and centralized multi-agent reinforcement learning for traffic signal network optimization","author":"Zhang","year":"2020"},{"key":"10.1016\/j.neunet.2026.109110_bib0049","series-title":"Proceedings of the 17th annual meeting of the special interest group on discourse and dialogue","first-page":"1","article-title":"Towards end-to-end learning for dialog state tracking and management using deep reinforcement learning","author":"Zhao","year":"2016"},{"issue":"5","key":"10.1016\/j.neunet.2026.109110_bib0050","doi-asserted-by":"crossref","first-page":"9676","DOI":"10.1609\/aaai.v34i05.6516","article-title":"Dynamic reward-based dueling deep dyna-Q: Robust policy learning in noisy environments","volume":"34","author":"Zhao","year":"2020","journal-title":"The Thirty-Fourth AAAI Conference on Artificial Intelligence"},{"key":"10.1016\/j.neunet.2026.109110_bib0051","doi-asserted-by":"crossref","first-page":"1380","DOI":"10.1109\/TASLP.2024.3357038","article-title":"Decomposed deep Q-network for coherent task-oriented dialogue policy learning","volume":"32","author":"Zhao","year":"2024","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"issue":"12","key":"10.1016\/j.neunet.2026.109110_bib0052","doi-asserted-by":"crossref","first-page":"8190","DOI":"10.1109\/TIV.2024.3408257","article-title":"Multiagent reinforcement learning: Methods, trustworthiness, applications in intelligent vehicles, and challenges","volume":"9","author":"Zhou","year":"2024","journal-title":"IEEE Transactions on Intelligent Vehicles"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005708?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005708?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T21:58:05Z","timestamp":1779573485000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026005708"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":52,"alternative-id":["S0893608026005708"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109110","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Collective reflection-based multi-agent reinforcement learning framework for task-oriented dialogue policy learning","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109110","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109110"}}