{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T16:09:27Z","timestamp":1770739767273,"version":"3.49.0"},"reference-count":56,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62136008"],"award-info":[{"award-number":["62136008"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62173324"],"award-info":[{"award-number":["62173324"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"International Partnership Program of Chinese Academy of Sciences","award":["104GJHZ2022013GC"],"award-info":[{"award-number":["104GJHZ2022013GC"]}]},{"name":"Suzhou Innovation and Entrepreneurship Leading Talents Program\u2013Innovation Leading Talent in Universities and Research Institutes","award":["ZXL2025310"],"award-info":[{"award-number":["ZXL2025310"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Syst. 
Man Cybern, Syst."],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1109\/tsmc.2025.3638818","type":"journal-article","created":{"date-parts":[[2025,12,17]],"date-time":"2025-12-17T18:47:31Z","timestamp":1765997251000},"page":"893-905","source":"Crossref","is-referenced-by-count":0,"title":["TeViR: Text-to-Video Reward With Diffusion Models for Efficient Reinforcement Learning"],"prefix":"10.1109","volume":"56","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1217-5877","authenticated-orcid":false,"given":"Yuhui","family":"Chen","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2559-9585","authenticated-orcid":false,"given":"Haoran","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"given":"Zhennan","family":"Jiang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1901-7876","authenticated-orcid":false,"given":"Haowei","family":"Wen","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8218-9633","authenticated-orcid":false,"given":"Dongbin","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1","article-title":"VIP: Towards universal visual reward and representation via value-implicit pre-training","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Ma"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2020.2975232"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2023.3270444"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.019"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2019.XV.073"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2020.3018325"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2927869"},{"key":"ref8","first-page":"1","article-title":"Reward design with language models","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Kwon"},{"key":"ref9","article-title":"Survey of vision-language-action models for embodied manipulation","author":"Li","year":"2025","journal-title":"arXiv:2508.15201"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2021.103500"},{"key":"ref11","article-title":"Concrete problems in AI safety","author":"Amodei","year":"2016","journal-title":"arXiv:1606.06565"},{"key":"ref12","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn. (ICML)","volume":"139","author":"Radford"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610421"},{"key":"ref14","first-page":"1","article-title":"RoboCLIP: One demonstration is enough to learn robot policies","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Sontakke"},{"key":"ref15","first-page":"1","article-title":"RL-VLM-F: Reinforcement learning from vision language foundation model feedback","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Wang"},{"key":"ref16","first-page":"8633","article-title":"Video diffusion models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ho"},{"key":"ref17","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv:2210.02303"},{"key":"ref18","first-page":"1","article-title":"Learning universal policies via text-guided video generation","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Du"},{"key":"ref19","first-page":"1","article-title":"Learning to act from actionless videos through dense correspondences","volume-title":"Proc. ICLR","author":"Ko"},{"key":"ref20","first-page":"1","article-title":"Learning interactive real-world simulators","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Yang"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.092"},{"key":"ref22","article-title":"Foundation reinforcement learning: Towards embodied generalist agents with foundation prior assistance","author":"Ye","year":"2023","journal-title":"arXiv:2310.02635"},{"key":"ref23","first-page":"1","article-title":"Video prediction models as rewards for reinforcement learning","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Escontrela"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72946-1_27"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2024.3387401"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1186\/s41235-018-0120-9"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2024.3383158"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.65109\/ckwc1742"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2021.3098451"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2023.3295255"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.010"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2023.3248324"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/1015330.1015430"},{"key":"ref34","first-page":"49","article-title":"Guided cost learning: Deep inverse optimal control via policy optimization","volume-title":"Proc. Int. Conf. Mach. Learn. ICML","author":"Finn"},{"key":"ref35","first-page":"537","article-title":"XIRL: Cross-embodiment inverse reinforcement learning","volume-title":"Proc. Conf. Robot., Learn.","volume":"164","author":"Zakka"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2017.XIII.050"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2021.xvii.012"},{"key":"ref38","first-page":"1","article-title":"Can pre-trained text-to-image models generate visual goals for reinforcement learning?","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Gao"},{"key":"ref39","first-page":"13584","article-title":"Language instructed reinforcement learning for human-AI coordination","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","volume":"202","author":"Hu"},{"key":"ref40","first-page":"374","article-title":"Language to rewards for robotic skill synthesis","volume-title":"Proc. Conf. Robot Learn. (CoRL)","volume":"229","author":"Yu"},{"key":"ref41","first-page":"1","article-title":"RoboGen: Towards unleashing infinite data for automated robot learning via generative simulation","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Wang"},{"key":"ref42","first-page":"1","article-title":"Eureka: Human-level reward design via coding large language models","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Ma"},{"key":"ref43","first-page":"120","article-title":"Vision-language models as success detectors","volume-title":"Proc. Conf. Lifelong Learn. Agents","volume":"232","author":"Du"},{"key":"ref44","first-page":"8657","article-title":"Guiding pretraining in reinforcement learning with large language models","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","volume":"202","author":"Du"},{"key":"ref45","first-page":"1","article-title":"Vision-language models are zero-shot reward models for reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Rocamonde"},{"key":"ref46","first-page":"23301","article-title":"LIV: Language-image representations and rewards for robotic control","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ma"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2024.3392930"},{"key":"ref48","article-title":"Zero-shot robotic manipulation with pretrained image-editing diffusion models","author":"Black","year":"2023","journal-title":"arXiv:2310.10639"},{"key":"ref49","first-page":"1","article-title":"Denoising diffusion implicit models","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Song"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref51","first-page":"1","article-title":"Exploration by random network distillation","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Burda"},{"key":"ref52","first-page":"1094","article-title":"Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning","volume-title":"Proc. Conf. Robot Learn. (CoRL)","volume":"100","author":"Yu"},{"key":"ref53","first-page":"1","article-title":"Mastering visual continuous control: Improved data-augmented reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Yarats"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1126\/scirobotics.ads5033"},{"key":"ref55","first-page":"1","article-title":"ControlVideo: Training-free controllable text-to-video generation","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Zhang"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2024.3487535"}],"container-title":["IEEE Transactions on Systems, Man, and Cybernetics: Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6221021\/11372519\/11301638.pdf?arnumber=11301638","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T21:07:30Z","timestamp":1770671250000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11301638\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":56,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tsmc.2025.3638818","relation":{},"ISSN":["2168-2216","2168-2232"],"issn-type":[{"value":"2168-2216","type":"print"},{"value":"2168-2232","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]}}}