{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:40:33Z","timestamp":1771699233424,"version":"3.50.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2007,1,10]],"date-time":"2007-01-10T00:00:00Z","timestamp":1168387200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Auton Agent Multi-Agent Syst"],"published-print":{"date-parts":[[2007,8,16]]},"DOI":"10.1007\/s10458-006-9010-5","type":"journal-article","created":{"date-parts":[[2007,1,9]],"date-time":"2007-01-09T14:36:42Z","timestamp":1168353402000},"page":"197-220","source":"Crossref","is-referenced-by-count":29,"title":["Shaping multi-agent systems with gradient reinforcement learning"],"prefix":"10.1007","volume":"15","author":[{"given":"Olivier","family":"Buffet","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alain","family":"Dutech","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fran\u00e7ois","family":"Charpillet","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2007,1,10]]},"reference":[{"issue":"2\u20133","key":"9010_CR1","first-page":"279","volume":"23","author":"M. Asada","year":"1996","unstructured":"Asada M., Noda S., Tawaratsumida S., Hosodaal K. (1996). Purposive behavior acquisition for a real robot by vision-based reinforcement learning. Machine Learning 23(2\u20133): 279\u2013303","journal-title":"Machine Learning"},{"key":"9010_CR2","unstructured":"Bartlett P., Baxter J. (1999). Hebbian synaptic modifications in spiking neurons that learn. Technical report, Australian National University"},{"key":"9010_CR3","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1613\/jair.806","volume":"15","author":"J. Baxter","year":"2001","unstructured":"Baxter J., Bartlett P. (2001). Infinite-horizon policy-gradient estimation. Journal of Artificial Intelligence Research 15, 319\u2013350","journal-title":"Journal of Artificial Intelligence Research"},{"key":"9010_CR4","doi-asserted-by":"crossref","first-page":"351","DOI":"10.1613\/jair.807","volume":"15","author":"J. Baxter","year":"2001","unstructured":"Baxter J., Bartlett P., Weaver L. (2001). Experiments with infinite-horizon, policy-gradient estimation. Journal of Artificial Intelligence Research 15, 351\u2013381","journal-title":"Journal of Artificial Intelligence Research"},{"issue":"4","key":"9010_CR5","doi-asserted-by":"crossref","first-page":"819","DOI":"10.1287\/moor.27.4.819.297","volume":"27","author":"D. Bernstein","year":"2002","unstructured":"Bernstein D., Givan R., Immerman N., Zilberstein S. (2002). The complexity of decentralized control of Markov decision processes. Mathematics of Operations Research 27(4): 819\u2013840","journal-title":"Mathematics of Operations Research"},{"key":"9010_CR6","unstructured":"Bertsekas, D., & Tsitsiklis, J. (1996). Neurodynamic programming. Athena Scientific."},{"key":"9010_CR7","unstructured":"Boutilier, C. (1996). Planning, learning and coordination in multiagent decision processes. In Y. Shoham (Ed.), Proceedings of the sixth conference on theoretical aspects of rationality and knowledge (TARK \u201996) (pp. 195\u2013210)."},{"key":"9010_CR8","unstructured":"Buffet, O. (2003). Une double approche modulaire de l\u2019apprentissage par renforcement pour des agents intelligents adaptatifs. Ph.D. thesis, Universit\u00e9 Henri Poincar\u00e9, Nancy 1. Laboratoire Lorrain de recherche en informatique et ses applications (LORIA)."},{"key":"9010_CR9","unstructured":"Buffet, O., & Aberdeen, D. (2006). The factored policy gradient planner (IPC-06 Version). In A. Gerevini, B. Bonet, & B. Givan (Eds.), Proceedings of the fifth international planning competition (IPC-5) (pp. 69\u201371). Winner, probabilistic track of the 5th International Planning Competition."},{"key":"9010_CR10","unstructured":"Buffet, O., Dutech, A., & Charpillet, F. (2004). Self-growth of basic behaviors in an action selection based agent. In S. Schaal, A. Ijspeert, A. Billard, S. Vijayakumar, J. Hallam, & J.-A. Meyer (Eds.), From animals to animats 8: Proceedings of the eighth international conference on simulation of adaptive behavior (SAB\u201904) (pp. 223\u2013232)."},{"issue":"4\u20135","key":"9010_CR11","doi-asserted-by":"crossref","first-page":"603","DOI":"10.3166\/ria.19.603-632","volume":"19","author":"O. Buffet","year":"2005","unstructured":"Buffet O., Dutech A., Charpillet F. (2005). D\u00e9veloppement autonome des comportements de base d\u2019un agent. Revue d\u2019Intelligence Artificielle, 19(4\u20135): 603\u2013632","journal-title":"Revue d\u2019Intelligence Artificielle,"},{"key":"9010_CR12","unstructured":"Carmel, D., & Markovitch, S. (1996). Adaption and learning in multi-agent systems, Vol. 1042, Lecture notes in artificial intelligence, Chapt. Opponent modeling in multi-agent systems (pp. 40\u201352). Springer-Verlag."},{"key":"9010_CR13","unstructured":"Cassandra, A. R. (1998). Exact and approximate algorithms for partially observable Markov decision processes. Ph.D. thesis, Brown University, Department of Computer Science, Providence, RI."},{"key":"9010_CR14","doi-asserted-by":"crossref","unstructured":"Dorigo, M., & Di Caro, G. (1999). Ant colony optimization: A new meta-heuristic. In P. Angeline, Z. Michalewicz, M. Schoenauer, X. Yao, & A. Zalzala (Eds.), Proceedings of the congress on evolutionary computation (CEC-99) (pp. 1470\u20131477).","DOI":"10.1109\/CEC.1999.782657"},{"key":"9010_CR15","unstructured":"Dutech, A. (2000). Solving POMDP using selected past-events. In W. Horn (Ed.), Proceedings of the fourteenth european conference on artificial intelligence (ECAI\u201900) (pp. 281\u2013285)."},{"issue":"4","key":"9010_CR16","first-page":"217","volume":"16","author":"F. Fern\u00e1ndez","year":"2001","unstructured":"Fern\u00e1ndez F., Parker L. (2001). Learning in large cooperative multi-robot domains. International Journal of Robotics and Automation 16(4): 217\u2013226","journal-title":"International Journal of Robotics and Automation"},{"issue":"9","key":"9010_CR17","doi-asserted-by":"crossref","first-page":"939","DOI":"10.1177\/0278364904045564","volume":"23","author":"B. Gerkey","year":"2004","unstructured":"Gerkey B., Matari\u0107 M. (2004). A formal analysis and taxonomy of task allocation in multi-robot systems. International Journal of Robotics Research 23(9): 939\u2013954","journal-title":"International Journal of Robotics Research"},{"key":"9010_CR18","unstructured":"Gmytrasiewicz, P., & Doshi, P. (2004). Interactive POMDPs: Properties and preliminary results. In Proceedings of the third international joint conference on autonomous agents and multi-agent systems (AAMAS\u201904)."},{"key":"9010_CR19","unstructured":"Goldman, C., Allen, M., & Zilberstein, S. (2004). Decentralized language learning through acting. In Proceedings of the third international joint conference on autonomous agents and multi-agent systems (AAMAS\u201904)."},{"key":"9010_CR20","unstructured":"Hengst, B. (2002). Discovering hierarchy in reinforcement learning with HEXQ. In C. Sammut & A. G. Hoffmann (Eds.), Proceedings of the nineteenth international conference on machine learning (ICML\u201902) (pp. 243\u2013250)."},{"key":"9010_CR21","doi-asserted-by":"crossref","unstructured":"Hu, J., & Wellman, M. (1998). Online learning about other agents in a dynamic multiagent system. In K. P. Sycara & M. Wooldridge (Eds.), Proceedings of the second international conference on autonomous agents (Agents\u201998) (pp. 239\u2013246).","DOI":"10.1145\/280765.280839"},{"issue":"6","key":"9010_CR22","doi-asserted-by":"crossref","first-page":"1186","DOI":"10.1162\/neco.1994.6.6.1185","volume":"6","author":"T. Jaakkola","year":"1994","unstructured":"Jaakkola T., Jordan M., Singh S. (1994). On the convergence of stochastic iterative dynamic programming algorithms. Neural Computation 6(6): 1186\u20131201","journal-title":"Neural Computation"},{"key":"9010_CR23","unstructured":"Jong, E. D. (2000). Attractors in the development of communication. In J.-A. Meyer, A. Berthoz, D. Floreano, H. L. Roitblat, & S. W. Wilson (Eds.), From animals to animats 6: Proceedings of the sixth international conference on simulation of adaptive behavior (SAB-00)."},{"key":"9010_CR24","doi-asserted-by":"crossref","first-page":"237","DOI":"10.1613\/jair.301","volume":"4","author":"L. Kaelbling","year":"1996","unstructured":"Kaelbling L., Littman M., Moore A. (1996). Reinforcement learning: A survey. Journal of Artificial Intelligence Research 4, 237\u2013285","journal-title":"Journal of Artificial Intelligence Research"},{"key":"9010_CR25","unstructured":"Laud, A. (2004). Theory and application of reward shaping in reinforcement learning. Ph.D. thesis, University of Illinois at Urbana-Champaign."},{"key":"9010_CR26","doi-asserted-by":"crossref","unstructured":"Littman, M., Cassandra, A., & Kaelbling, L. (1995). Learning policies for partially observable environments: Scaling up. In A. Prieditis & S. J. Russell (Eds.), Proceedings of the twelveth international conference on machine learning (ICML\u201995) (pp. 362\u2013370).","DOI":"10.1016\/B978-1-55860-377-6.50052-9"},{"issue":"1","key":"9010_CR27","doi-asserted-by":"crossref","first-page":"73","DOI":"10.1023\/A:1008819414322","volume":"4","author":"M. Matari\u0107","year":"1997","unstructured":"Matari\u0107 M. (1997). Reinforcement learning in the multi-robot domain. Autonomous Robots 4(1): 73\u201383","journal-title":"Autonomous Robots"},{"key":"9010_CR28","unstructured":"McCallum, R. A. (1995). Reinforcement learning with selective perception and hidden state. Ph.D. thesis, University of Rochester."},{"key":"9010_CR29","unstructured":"Ng, A., Harada, D., & Russell, S. (1999). Policy invariance under reward transformations: Theory and application to reward shaping. In I. Bratko & S. Dzeroski (Eds.), Proceedings of the sixteenth international conference on machine learning (ICML\u201999) (pp. 278\u2013287)."},{"key":"9010_CR30","unstructured":"Peshkin, L., Kim, K., Meuleau, N., & Kaelbling, L. (2000). Learning to cooperate via policy search. In C. Boutilier & M. Goldszmidt (Eds.), Proceedings of the sixteenth conference on uncertainty in artificial intelligence (UAI\u201900) (pp. 489\u2013496)."},{"key":"9010_CR31","doi-asserted-by":"crossref","unstructured":"Peters, J., Vijayakumar, S., & Schaal, S. (2005). Natural actor-critic. In J. Gama, R. Camacho, P. Brazdil, A. Jorge, & L. Torgo (Eds.), Proceedings of the sixteenth european conference on machine learning (ECML\u201905), Vol. 3720, Lecture notes in computer science.","DOI":"10.1007\/11564096_29"},{"key":"9010_CR32","doi-asserted-by":"crossref","unstructured":"Puterman, M. L. (1994). Markov decision processes\u2014Discrete stochastic dynamic programming. New York, USA: Wiley.","DOI":"10.1002\/9780470316887"},{"key":"9010_CR33","doi-asserted-by":"crossref","first-page":"389","DOI":"10.1613\/jair.1024","volume":"16","author":"D. Pynadath","year":"2002","unstructured":"Pynadath D., Tambe M. (2002). The communicative multiagent team decision problem: Analyzing teamwork theories and models. Journal of Artificial Intelligence Research 16: 389\u2013423","journal-title":"Journal of Artificial Intelligence Research"},{"key":"9010_CR34","unstructured":"Randl\u00f8v, J. (2000). Shaping in reinforcement learning by changing the physics of the problem. In P. Langley (Ed.), Proceedings of the seventeenth international conference on machine learning (ICML\u201900) (pp. 767\u2013774)."},{"key":"9010_CR35","unstructured":"Randl\u00f8v, J., & Alstr\u00f8m, P. (1998). Learning to drive a bicycle using reinforcement learning and shaping. In J. W. Shavlik (Ed.), Proceedings of the fifteenth international conference on machine learning (ICML\u201998) (pp. 463\u2013471)."},{"key":"9010_CR36","unstructured":"Rogowski, C. (2004). Model-based opponent modelling in domains beyond the prisoner\u2019s dilemma. In Proceedings of modeling other agents from observations (MOO 2004), AAMAS\u201904 workshop."},{"key":"9010_CR37","doi-asserted-by":"crossref","first-page":"263","DOI":"10.1023\/A:1007570708568","volume":"33","author":"R. Salustowicz","year":"1998","unstructured":"Salustowicz R., Wiering M., Schmidhuber J. (1998). Learning team strategies: Soccer case studies. Machine Learning 33, 263\u2013282","journal-title":"Machine Learning"},{"key":"9010_CR38","unstructured":"Shoham, Y., Powers, R., & Grenager, T. (2003). Multi-agent reinforcement learning: A critical survey. Technical report, Stanford."},{"key":"9010_CR39","doi-asserted-by":"crossref","unstructured":"Singh, S., Jaakkola, T., & Jordan, M. (1994). Learning without state estimation in partially observable Markovian decision processes. In W. W. Cohen & H. Hirsh (Eds.), Proceedings of the eleventh international conference on machine learning (ICML\u201994).","DOI":"10.1016\/B978-1-55860-335-6.50042-8"},{"key":"9010_CR40","volume-title":"Science and human behavior","author":"B. Skinner","year":"1953","unstructured":"Skinner B. (1953). Science and human behavior. New-York, Collier-Macmillian"},{"key":"9010_CR41","unstructured":"Staddon, J. (1983). Adaptative behavior and learning. Cambridge University Press."},{"key":"9010_CR42","doi-asserted-by":"crossref","unstructured":"Stone, P., & Veloso, M. (2000a). Layered learning. In R. L. de M\u00e1ntaras & E. Plaza (Eds.), Proceedings of the eleventh european conference on machine learning (ECML\u201900), Vol. 1810, Lecture notes in computer science.","DOI":"10.1007\/3-540-45164-1_38"},{"key":"9010_CR43","unstructured":"Stone, P., & Veloso, M. (2000b). Multiagent systems: A survey from a machine learning perspective. Autonomous Robotics, 8(3)."},{"key":"9010_CR44","doi-asserted-by":"crossref","unstructured":"Stone, P., & Veloso, M. (2000c). Team-partitioned, opaque-transition reinforcement learning. In Proceedings of the third international conference on autonomous agents (Agents\u201900).","DOI":"10.1145\/301136.301195"},{"key":"9010_CR45","unstructured":"Sutton, R., & Barto, G. (1998). Reinforcement learning: An introduction. Cambridge, MA: Bradford Book, MIT Press."},{"key":"9010_CR46","unstructured":"Sutton, R., McAllester, D., Singh, S., & Mansour, Y. (1999). Policy gradient methods for reinforcement learning with function approximation. In S. A. Solla, T. K. Leen, & K.-R. M\u00fcller (Eds.), Advances in neural information processing systems 11 (NIPS\u201999), Vol. 12 (pp. 1057\u20131063)."},{"key":"9010_CR47","unstructured":"Tumer, K., & Wolpert, D. (2000). Collective intelligence and Braess paradox. In Proceedings of the sixteenth national conference on artificial intelligence (AAAI\u201900) (pp. 104\u2013109)."},{"key":"9010_CR48","unstructured":"Tyrrell, T. (1993). Computational mechanisms for action selection. Ph.D. thesis, University of Edinburgh."},{"key":"9010_CR49","unstructured":"Vidal, J., & Durfee, E. (1997). Agent learning about agents: A framework and analysis. In S. Sen (Ed.), Collected papers from the AAAI-97 workshop on multiagent learning (pp. 71\u201376)."},{"key":"9010_CR50","unstructured":"Watkins, C. (1989). Learning from delayed rewards. Ph.D. thesis, King\u2019s College of Cambridge, UK."},{"key":"9010_CR51","unstructured":"Wolpert, D., & Tumer, K. (1999). An introduction to collective intelligence. Technical Report NASA-ARC-IC-99-63, NASA AMES Research Center."},{"key":"9010_CR52","doi-asserted-by":"crossref","unstructured":"Wolpert, D., Wheeler, K., & Tumer, K. (1999). General principles of learning-based multi-agent systems. In Proceedings of the third international conference on autonomous agents (Agents\u201999) (pp. 77\u201383).","DOI":"10.1145\/301136.301167"},{"key":"9010_CR53","unstructured":"Xuan, P., Lesser, V., & Zilberstein, S. (2000). Communication in multi-agent Markov decision processes. In S. Parsons & P. Gmytrasiewicz (Eds.), Proceedings of ICMAS workshop on game theoretic and decision theoretic agents."}],"container-title":["Autonomous Agents and Multi-Agent Systems"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10458-006-9010-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10458-006-9010-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10458-006-9010-5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,8,7]],"date-time":"2021-08-07T11:50:25Z","timestamp":1628337025000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10458-006-9010-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2007,1,10]]},"references-count":53,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2007,8,16]]}},"alternative-id":["9010"],"URL":"https:\/\/doi.org\/10.1007\/s10458-006-9010-5","relation":{},"ISSN":["1387-2532","1573-7454"],"issn-type":[{"value":"1387-2532","type":"print"},{"value":"1573-7454","type":"electronic"}],"subject":[],"published":{"date-parts":[[2007,1,10]]}}}