{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:20:18Z","timestamp":1777890018498,"version":"3.51.4"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23A20310"],"award-info":[{"award-number":["U23A20310"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00762","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"8133-8143","source":"Crossref","is-referenced-by-count":0,"title":["Training-Free Generation of Temporally Consistent Rewards from VLMs"],"prefix":"10.1109","author":[{"given":"Yinuo","family":"Zhao","sequence":"first","affiliation":[{"name":"Beijing Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiale","family":"Yuan","sequence":"additional","affiliation":[{"name":"Taobao &#x0026; Tmall Group of Alibaba"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiyuan","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing Innovation Center of Humanoid Robotics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoshuai","family":"Hao","sequence":"additional","affiliation":[{"name":"Beijing Academy of Artificial Intelligence"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kun","family":"Wu","sequence":"additional","affiliation":[{"name":"Beijing Innovation Center of Humanoid Robotics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengping","family":"Che","sequence":"additional","affiliation":[{"name":"Beijing Innovation Center of Humanoid Robotics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chi Harold","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Tang","sequence":"additional","affiliation":[{"name":"Beijing Innovation Center of Humanoid Robotics"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref2","article-title":"Vision-language models as a source of rewards","volume-title":"Second Agent Learning in Open-Endedness Workshop","author":"Baumli","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref4","article-title":"Towards a unified agent with foundation models","volume-title":"Workshop on Reincarnating Reinforcement Learning at ICLR 2023","author":"Di Palo","year":"2023"},{"key":"ref5","article-title":"Video language planning","volume-title":"International Conference on Learning Representations","author":"Du","year":"2024"},{"key":"ref6","article-title":"Manipulateanything: Automating real-world robots using visionlanguage models","volume-title":"8th Annual Conference on Robot Learning","author":"Duan","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610090"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10802284"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/wacvw58289.2023.00042"},{"key":"ref10","first-page":"540","article-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models","volume-title":"Conference on Robot Learning","author":"Huang","year":"2023"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00168"},{"key":"ref12","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International Conference on Machine Learning","author":"Jia","year":"2021"},{"key":"ref13","article-title":"Decomposed prompting: A modular approach for solving complex tasks","volume-title":"The Eleventh International Conference on Learning Representations","author":"Khot","year":"2023"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341579"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1177\/02783649251390579"},{"key":"ref17","first-page":"1973019742","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International Conference on Machine Learning","author":"Li","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref19","first-page":"3468","article-title":"Reflect: Summarizing robot experiences for failure explanation and correction","volume-title":"Conference on Robot Learning","author":"Liu","year":"2023"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11127622"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3180108"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-024-69901-7"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.302"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11127585"},{"key":"ref25","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"},{"key":"ref26","article-title":"Sam 2: Segment anything in images and videos","author":"Ravi","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref27","article-title":"Vision-language models are zeroshot reward models for reinforcement learning","volume-title":"International Conference on Learning Representations","author":"Rocamonde","year":"2024"},{"key":"ref28","article-title":"Plan diffuser: Grounding LLM planners with diffusion models for robotic manipulation","author":"Sharan","year":"2024","journal-title":"Bridging the Gap between Cognitive Science and Robot Learning in the Real World: Progresses and New Directions"},{"key":"ref29","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","volume-title":"Conference on robot learning","author":"Shridhar","year":"2022"},{"key":"ref30","first-page":"785","article-title":"Perceiveractor: A multi-task transformer for robotic manipulation","volume-title":"Conference on Robot Learning","author":"Shridhar","year":"2023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref32","article-title":"Reason-rft: Reinforcement fine-tuning for visual reasoning","author":"Tan","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00285"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/IROS60139.2025.11245995"},{"key":"ref35","article-title":"Real-world offline reinforcement learning from vision language model feedback","author":"Venkataraman","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref36","article-title":"Code as reward: Empowering reinforcement learning with vlms","volume-title":"International Conference on Machine Learning","author":"Venuto","year":"2024"},{"key":"ref37","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref38","article-title":"Rl-vlm-f: Reinforcement learning from vision language foundation model feedback","volume-title":"International Conference on Machine Learning","author":"Wang","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2024.xx.092"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.152"},{"key":"ref41","article-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v","author":"Yang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref42","first-page":"48044811","article-title":"Robot finetuning made easy: Pre-training rewards and policies for autonomous real-world reinforcement learning","volume-title":"International Conference on Robotics and Automation","author":"Yang","year":"2024"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3216985"},{"key":"ref44","first-page":"148","article-title":"Sornet: Spatial object-centric representations for sequential manipulation","volume-title":"Conference on Robot Learning","author":"Yuan","year":"2022"},{"key":"ref45","article-title":"Robopoint: A vision-language model for spatial affordance prediction in robotics","volume-title":"8th Annual Conference on Robot Learning","author":"Yuan","year":"2024"},{"key":"ref46","article-title":"Sam-e: Leveraging visual foundation model with sequence imitation for embodied manipulation","volume-title":"International Conference on Machine Learning","author":"Zhang","year":"2024"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.638"},{"key":"ref48","article-title":"Tracevla: Visual trace prompting enhances spatial-temporal awareness for generalist robotic policies","author":"Zheng","year":"2024","journal-title":"arXiv preprint arXiv"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11443523.pdf?arnumber=11443523","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:25:02Z","timestamp":1777613102000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11443523\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00762","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}