{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T12:41:05Z","timestamp":1766061665746,"version":"3.48.0"},"reference-count":62,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iros60139.2025.11246902","type":"proceedings-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:54:45Z","timestamp":1764269685000},"page":"11685-11692","source":"Crossref","is-referenced-by-count":0,"title":["Policy Learning from Large Vision-Language Model Feedback Without Reward Modeling"],"prefix":"10.1109","author":[{"given":"Tung M.","family":"Luu","sequence":"first","affiliation":[{"name":"KAIST (Korea Advanced Institute of Science and Technology),School of Electrical Engineering,Daejeon,Republic of Korea"}]},{"given":"Donghoon","family":"Lee","sequence":"additional","affiliation":[{"name":"KAIST (Korea Advanced Institute of Science and Technology),School of Electrical Engineering,Daejeon,Republic of Korea"}]},{"given":"Younghwan","family":"Lee","sequence":"additional","affiliation":[{"name":"KAIST (Korea Advanced Institute of Science and Technology),School of Electrical Engineering,Daejeon,Republic of Korea"}]},{"given":"Chang D.","family":"Yoo","sequence":"additional","affiliation":[{"name":"KAIST (Korea Advanced Institute of Science and Technology),School of Electrical Engineering,Daejeon,Republic of 
Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-019-1724-z"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1126\/science.add4679"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-2939-8"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06419-4"},{"article-title":"Scalable deep reinforcement learning for vision-based robotic manipulation","volume-title":"Conference on Robot Learning","author":"Kalashnikov","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3069975"},{"article-title":"Towards human-level bimanual dexterous manipulation with reinforcement learning","volume-title":"Conference on Neural Information Processing Systems","author":"Chen","key":"ref7"},{"article-title":"Inverse reward design","volume-title":"Conference on Neural Information Processing Systems","author":"Hadfield-Menell","key":"ref8"},{"article-title":"Defining and characterizing reward gaming","volume-title":"Conference on Neural Information Processing Systems","author":"Skalse","key":"ref9"},{"article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","year":"2023","author":"Zhu","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.797"},{"article-title":"Autobench-v: Can large vision-language models benchmark themselves?","year":"2024","author":"Bao","key":"ref12"},{"article-title":"Openvla: An open-source vision-language-action model","volume-title":"Conference on Robot Learning","author":"Kim","key":"ref13"},{"article-title":"Code as reward: Empowering reinforcement learning with vlms","volume-title":"International Conference on Machine Learning","author":"Venuto","key":"ref14"},{"article-title":"Guiding pretraining in reinforcement learning with large language models","volume-title":"International Conference on Machine 
Learning","author":"Du","key":"ref15"},{"article-title":"Bootstrap your own skills: Learning to solve new tasks with large language model guidance","volume-title":"Conference on Robot Learning","author":"Zhang","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","key":"ref19"},{"article-title":"Roboclip: One demonstration is enough to learn robot policies","volume-title":"Conference on Neural Information Processing Systems","author":"Sontakke","key":"ref20"},{"article-title":"Vision-language models are zero-shot reward models for reinforcement learning","volume-title":"International Conference on Learning Representations","author":"Rocamonde","key":"ref21"},{"article-title":"Rl-vlm-f: Reinforcement learning from vision language foundation model feedback","volume-title":"International Conference on Machine Learning","author":"Wang","key":"ref22"},{"article-title":"Language to rewards for robotic skill synthesis","volume-title":"Conference on Robot Learning","author":"Yu","key":"ref23"},{"article-title":"Text2reward: Automated dense reward function generation for reinforcement learning","volume-title":"International Conference on Learning Representations","author":"Xie","key":"ref24"},{"key":"ref25","article-title":"Real-world offline reinforcement learning from vision language model feedback","author":"Venkataraman","year":"2024","journal-title":"CoRL-W"},{"article-title":"Deep reinforcement learning from human preferences","volume-title":"Conference on Neural Information Processing Systems","author":"Christiano","key":"ref26"},{"article-title":"B-pref: Benchmarking preference-based reinforcement learning","volume-title":"Conference on Neural Information Processing 
Systems","author":"Lee","key":"ref27"},{"article-title":"Inverse preference learning: Preference-based rl without a reward function","volume-title":"Conference on Neural Information Processing Systems","author":"Hejna","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.280"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1162\/coli.a.16"},{"article-title":"\"Task success\" is not enough: Investigating the use of video-language models as behavior critics for catching undesirable agent behaviors","volume-title":"Conference on Language Modeling","author":"Guan","key":"ref31"},{"article-title":"Non-markovian reward modelling from trajectory labels via interpretable multiple instance learning","volume-title":"Conference on Neural Information Processing Systems","author":"Early","key":"ref32"},{"article-title":"Preference transformer: Modeling human preferences using transformers for rl","volume-title":"International Conference on Learning Representations","author":"Kim","key":"ref33"},{"article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","year":"2024","author":"Reid","key":"ref34"},{"article-title":"Contrastive preference learning: learning from human feedback without rl","volume-title":"International Conference on Learning Representations","author":"Hejna","key":"ref35"},{"article-title":"Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning","volume-title":"Conference on Robot Learning","author":"Yu","key":"ref36"},{"article-title":"Minedojo: Building open-ended embodied agents with internet-scale knowledge","volume-title":"Conference on Neural Information Processing Systems","author":"Fan","key":"ref37"},{"article-title":"Liv: Language-image representations and rewards for robotic control","volume-title":"International Conference on Machine Learning","author":"Ma","key":"ref38"},{"article-title":"Enhancing rating-based reinforcement 
learning to effectively leverage feedback from large vision-language models","volume-title":"Forty-second International Conference on Machine Learning","author":"Luu","key":"ref39"},{"article-title":"Alvinn: An autonomous land vehicle in a neural network","volume-title":"Conference on Neural Information Processing Systems","author":"Pomerleau","key":"ref40"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3295255"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10802437"},{"article-title":"Algorithms for inverse reinforcement learning","volume-title":"International Conference on Machine Learning","author":"Ng","key":"ref44"},{"article-title":"Generative adversarial imitation learning","volume-title":"Conference on Neural Information Processing Systems","author":"Ho","key":"ref45"},{"article-title":"Nonlinear inverse reinforcement learning with gaussian processes","volume-title":"Conference on Neural Information Processing Systems","author":"Levine","key":"ref46"},{"article-title":"Guided cost learning: Deep inverse optimal control via policy optimization","volume-title":"International Conference on Machine Learning","author":"Finn","key":"ref47"},{"article-title":"Learning robust rewards with adversarial inverse reinforcement learning","volume-title":"International Conference on Learning Representations","author":"Fu","key":"ref48"},{"article-title":"Confidence-aware imitation learning from demonstrations with varying optimality","volume-title":"Conference on Neural Information Processing Systems","author":"Zhang","key":"ref49"},{"article-title":"Extrapolating beyond suboptimal demonstrations via inverse reinforcement learning from observations","volume-title":"International Conference on Machine Learning","author":"Brown","key":"ref50"},{"article-title":"Denoising diffusion probabilistic models","volume-title":"Conference on Neural 
Information Processing Systems","author":"Ho","key":"ref51"},{"article-title":"Mdsgen: Fast and efficient masked diffusion temporal-aware transformers for open-domain sound generation","volume-title":"International Conference on Learning Representations","author":"Pham","key":"ref52"},{"article-title":"Taro: Timestep-adaptive representation alignment with onset-aware conditioning for synchronized video-to-audio synthesis","volume-title":"International Conference on Computer Vision","author":"Ton","key":"ref53"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72946-1_27"},{"article-title":"Direct preference-based policy optimization without reward modeling","volume-title":"Conference on Neural Information Processing Systems","author":"An","key":"ref55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6161"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01249"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2016.0121"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2022.3186930"},{"article-title":"R3m: A universal visual representation for robot manipulation","volume-title":"Conference on Robot Learning","author":"Nair","key":"ref60"},{"article-title":"Offline reinforcement learning with implicit q-learning","volume-title":"International Conference on Learning Representations","author":"Kostrikov","key":"ref61"},{"article-title":"Rime: Robust preference-based reinforcement learning with noisy preferences","volume-title":"International Conference on Machine Learning","author":"Cheng","key":"ref62"}],"event":{"name":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","start":{"date-parts":[[2025,10,19]]},"location":"Hangzhou, China","end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems 
(IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11245651\/11245652\/11246902.pdf?arnumber=11246902","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T12:37:35Z","timestamp":1766061455000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11246902\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/iros60139.2025.11246902","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}