{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T09:52:46Z","timestamp":1743155566332,"version":"3.40.3"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729973"},{"type":"electronic","value":"9783031729980"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72998-0_10","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T18:01:58Z","timestamp":1727632918000},"page":"163-180","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Adapt2Reward: Adapting Video-Language Models to\u00a0Generalizable Robotic Rewards via\u00a0Failure Prompts"],"prefix":"10.1007","author":[{"given":"Yanting","family":"Yang","sequence":"first","affiliation":[]},{"given":"Minghao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Qibo","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Jiahao","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Wenxiao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Binbin","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Ziyu","family":"Guan","sequence":"additional","affiliation":[]},{"given":"Xiaofei","family":"He","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Abbeel, P., Ng, A.: Apprenticeship learning via inverse reinforcement learning. In: Proceedings of the Twenty-First International Conference on Machine Learning (2004)","DOI":"10.1145\/1015330.1015430"},{"key":"10_CR2","unstructured":"Babaeizadeh, M., Saffar, M.T., Nair, S., Levine, S., Finn, C., Erhan, D.: FitVid: overfitting in pixel-level video prediction. ArXiv abs\/2106.13195 (2021)"},{"key":"10_CR3","unstructured":"Brohan, A., et al.: RT-2: vision-language-action models transfer web knowledge to robotic control. ArXiv abs\/2307.15818 (2023)"},{"key":"10_CR4","unstructured":"Brohan, A., et al.: RT-1: robotics transformer for real-world control at scale. ArXiv abs\/2212.06817 (2022)"},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Chen, A.S., Nair, S., Finn, C.: Learning generalizable robotic reward functions from \u201cin-the-wild\u201d human videos. ArXiv abs\/2103.16817 (2021)","DOI":"10.15607\/RSS.2021.XVII.012"},{"key":"10_CR6","unstructured":"Das, N., Bechtle, S., Davchev, T., Jayaraman, D., Rai, A., Meier, F.: Model-based inverse reinforcement learning from visual demonstrations. In: Conference on Robot Learning, pp. 1930\u20131942. PMLR (2021)"},{"key":"10_CR7","unstructured":"Du, Y., et al.: Vision-language models as success detectors. 
ArXiv abs\/2303.07280 (2023)"},{"key":"10_CR8","unstructured":"Ebert, F., Finn, C., Dasari, S., Xie, A., Lee, A.X., Levine, S.: Visual foresight: model-based deep reinforcement learning for vision-based robotic control. ArXiv abs\/1812.00568 (2018)"},{"key":"10_CR9","unstructured":"Fan, L.J., et al.: MineDojo: building open-ended embodied agents with internet-scale knowledge. ArXiv abs\/2206.08853 (2022)"},{"key":"10_CR10","unstructured":"Finn, C., Levine, S., Abbeel, P.: Guided cost learning: deep inverse optimal control via policy optimization. In: International Conference on Machine Learning (2016)"},{"key":"10_CR11","unstructured":"Fu, J., Luo, K., Levine, S.: Learning robust rewards with adverserial inverse reinforcement learning. In: International Conference on Learning Representations (2018)"},{"key":"10_CR12","unstructured":"Fu, J., Singh, A., Ghosh, D., Yang, L., Levine, S.: Variational inverse control with events: a general framework for data-driven reward definition. In: Neural Information Processing Systems (2018)"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 5843\u20135851 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"10_CR14","unstructured":"Hafner, D., et al.: Learning latent dynamics for planning from pixels. In: International Conference on Machine Learning, pp. 2555\u20132565. PMLR (2019)"},{"key":"10_CR15","unstructured":"Jain, A., Hu, M., Ratliff, N.D., Bagnell, D., Zinkevich, M.A.: Maximum margin planning. In: Proceedings of the 23rd International Conference on Machine Learning (2006)"},{"key":"10_CR16","unstructured":"Jang, E., et al.: BC-Z: zero-shot task generalization with robotic imitation learning. ArXiv abs\/2202.02005 (2022)"},{"key":"10_CR17","unstructured":"Kwon, M., Xie, S.M., Bullard, K., Sadigh, D.: Reward design with language models. ArXiv abs\/2303.00001 (2023)"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Lee, J., Ryoo, M.S.: Learning robot activities from first-person human videos using convolutional future regression. In: 2017 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 1497\u20131504 (2017)","DOI":"10.1109\/IROS.2017.8205953"},{"key":"10_CR19","doi-asserted-by":"publisher","first-page":"1323","DOI":"10.1016\/j.robot.2013.08.003","volume":"61","author":"K Lee","year":"2013","unstructured":"Lee, K., Su, Y., Kim, T.K., Demiris, Y.: A syntactic approach to robot imitation learning using probabilistic activity grammars. Robot. Auton. Syst. 61, 1323\u20131334 (2013)","journal-title":"Robot. Auton. Syst."},{"key":"10_CR20","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Revealing single frame bias for video-and-language learning. ArXiv abs\/2206.03428 (2022)"},{"key":"10_CR21","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning (2022)"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Zhao, X., Chen, C., Pang, S., Zhou, Z., Yin, J.: Scenario-driven cyber-physical-social system: Intelligent workflow generation based on capability. 
In: Companion Proceedings of the ACM on Web Conference 2024 (2024)","DOI":"10.1145\/3589335.3651246"},{"key":"10_CR23","unstructured":"Lillicrap, T.P., Hunt, J.J., Pritzel, A., Heess, N.M.O., Erez, T., Tassa, Y., Silver, D., Wierstra, D.: Continuous control with deep reinforcement learning. CoRR abs\/1509.02971 (2015)"},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Y., Gupta, A., Abbeel, P., Levine, S.: Imitation from observation: learning to imitate behaviors from raw video via context translation. In: 2018 IEEE International Conference on Robotics and Automation (ICRA), pp. 1118\u20131125 (2017)","DOI":"10.1109\/ICRA.2018.8462901"},{"key":"10_CR25","doi-asserted-by":"crossref","unstructured":"Luo, H., et al.: CLIP4Clip: an empirical study of clip for end to end video clip retrieval. Neurocomputing 508, 293\u2013304 (2021)","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"10_CR26","unstructured":"Ma, Y.J., Liang, W., Som, V., Kumar, V., Zhang, A., Bastani, O., Jayaraman, D.: LIV: language-image representations and rewards for robotic control. In: International Conference on Machine Learning (2023)"},{"key":"10_CR27","unstructured":"Ma, Y.J., et al.: Eureka: human-level reward design via coding large language models. ArXiv abs\/2310.12931 (2023)"},{"key":"10_CR28","unstructured":"Ma, Y.J., Sodhani, S., Jayaraman, D., Bastani, O., Kumar, V., Zhang, A.: VIP: towards universal visual reward and representation via value-implicit pre-training. ArXiv abs\/2210.00030 (2022)"},{"key":"10_CR29","unstructured":"Nair, S., Mitchell, E., Chen, K., Ichter, B., Savarese, S., Finn, C.: Learning language-conditioned robot behavior from offline data and crowd-sourced annotation. In: CoRL (2021)"},{"key":"10_CR30","unstructured":"Nair, S., Rajeswaran, A., Kumar, V., Finn, C., Gupta, A.: R3M: a universal visual representation for robot manipulation. In: CoRL (2022)"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Nguyen, A., Kanoulas, D., Muratore, L., Caldwell, D.G., Tsagarakis, N.G.: Translating videos to commands for robotic manipulation with deep recurrent neural networks. 2018 IEEE International Conference on Robotics and Automation (ICRA), pp.\u00a01\u20139 (2017)","DOI":"10.1109\/ICRA.2018.8460857"},{"key":"10_CR32","unstructured":"OpenAI: GPT-4 technical report. ArXiv abs\/2303.08774 (2023)"},{"key":"10_CR33","unstructured":"Parisi, S., Rajeswaran, A., Purushwalkam, S., Gupta, A.K.: The unsurprising effectiveness of pre-trained vision models for control. In: International Conference on Machine Learning (2022)"},{"key":"10_CR34","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"10_CR35","unstructured":"Radosavovic, I., Xiao, T., James, S., Abbeel, P., Malik, J., Darrell, T.: Real-world robot learning with masked visual pre-training. In: CoRL (2022)"},{"key":"10_CR36","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. ArXiv abs\/2204.06125 (2022)"},{"key":"10_CR37","doi-asserted-by":"publisher","first-page":"4007","DOI":"10.1109\/LRA.2018.2860057","volume":"3","author":"J Rothfuss","year":"2018","unstructured":"Rothfuss, J., Ferreira, F., Aksoy, E.E., Zhou, Y., Asfour, T.: Deep episodic memory: encoding, recalling, and predicting episodic experiences for robot action execution. IEEE Robot. Autom. Lett. 3, 4007\u20134014 (2018)","journal-title":"IEEE Robot. Autom. 
Lett."},{"key":"10_CR38","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4757-4321-0","volume-title":"The Cross-Entropy Method: A Unified Approach to Combinatorial Optimization, Monte-Carlo Simulation, and Machine Learning","author":"RY Rubinstein","year":"2004","unstructured":"Rubinstein, R.Y., Kroese, D.P.: The Cross-Entropy Method: A Unified Approach to Combinatorial Optimization, Monte-Carlo Simulation, and Machine Learning, vol. 133. Springer, Heidelberg (2004). https:\/\/doi.org\/10.1007\/978-1-4757-4321-0"},{"key":"10_CR39","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. ArXiv abs\/2210.08402 (2022)"},{"key":"10_CR40","unstructured":"Schuhmann, C., et .: LAION-400M: open dataset of clip-filtered 400 million image-text pairs. ArXiv abs\/2111.02114 (2021)"},{"key":"10_CR41","doi-asserted-by":"crossref","unstructured":"Sermanet, P., et al.: Time-contrastive networks: Self-supervised learning from video. In: 2018 IEEE International Conference on Robotics and Automation (ICRA), pp. 1134\u20131141 (2017)","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"10_CR42","doi-asserted-by":"publisher","first-page":"1419","DOI":"10.1177\/02783649211046285","volume":"40","author":"L Shao","year":"2020","unstructured":"Shao, L., Migimatsu, T., Zhang, Q., Yang, K., Bohg, J.: Concept2Robot: learning manipulation concepts from instructions and human demonstrations. Int. J. Robot. Res. 40, 1419\u20131434 (2020)","journal-title":"Int. J. Robot. Res."},{"key":"10_CR43","unstructured":"Sharma, P., Pathak, D., Gupta, A.K.: Third-person visual imitation learning via decoupled hierarchical controller. In: Neural Information Processing Systems (2019)"},{"key":"10_CR44","unstructured":"Shaw, K., Bahl, S., Pathak, D.: VideoDex: learning dexterity from internet videos. In: Conference on Robot Learning (2022)"},{"key":"10_CR45","unstructured":"Shridhar, M., Manuelli, L., Fox, D.: CLIPort: what and where pathways for robotic manipulation. ArXiv abs\/2109.12098 (2021)"},{"key":"10_CR46","doi-asserted-by":"crossref","unstructured":"Singh, A., Yang, L., Hartikainen, K., Finn, C., Levine, S.: End-to-end robotic reinforcement learning without reward engineering. ArXiv abs\/1904.07854 (2019)","DOI":"10.15607\/RSS.2019.XV.073"},{"key":"10_CR47","doi-asserted-by":"crossref","unstructured":"Smith, L.M., Dhawan, N., Zhang, M., Abbeel, P., Levine, S.: AVID: learning multi-stage tasks via pixel-level translation of human videos. ArXiv abs\/1912.04443 (2019)","DOI":"10.15607\/RSS.2020.XVI.024"},{"key":"10_CR48","unstructured":"Stone, A., et al.: Open-world object manipulation using pre-trained vision-language models. ArXiv abs\/2303.00905 (2023)"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Todorov, E., Erez, T., Tassa, Y.: MuJoCo: a physics engine for model-based control. In: 2012 IEEE\/RSJ International Conference on Intelligent Robots and Systems, pp. 5026\u20135033 (2012)","DOI":"10.1109\/IROS.2012.6386109"},{"key":"10_CR50","unstructured":"Wang, C., et al.: MimicPlay: long-horizon imitation learning by watching human play. ArXiv abs\/2302.12422 (2023)"},{"key":"10_CR51","unstructured":"Watter, M., Springenberg, J., Boedecker, J., Riedmiller, M.: Embed to control: a locally linear latent dynamics model for control from raw images. Advances in Neural Inf. Process. Syst. 
28 (2015)"},{"key":"10_CR52","doi-asserted-by":"crossref","unstructured":"Wu, J., Fan, W., Chen, J., Liu, S., Li, Q., Tang, K.: Disentangled contrastive learning for social recommendation. In: Proceedings of the 31st ACM International Conference on Information & Knowledge Management (2022)","DOI":"10.1145\/3511808.3557583"},{"key":"10_CR53","unstructured":"Wu, J., et al.: Leveraging large language models (LLMs) to empower training-free dataset condensation for content-based recommendation. ArXiv abs\/2310.09874 (2023)"},{"key":"10_CR54","unstructured":"Wulfmeier, M., Ondruska, P., Posner, I.: Maximum entropy deep inverse reinforcement learning. arXiv Learning (2015)"},{"key":"10_CR55","unstructured":"Xiao, T., Radosavovic, I., Darrell, T., Malik, J.: Masked visual pre-training for motor control. ArXiv abs\/2203.06173 (2022)"},{"key":"10_CR56","unstructured":"Xie, T., et al.: Text2Reward: automated dense reward function generation for reinforcement learning. ArXiv abs\/2309.11489 (2023)"},{"key":"10_CR57","doi-asserted-by":"crossref","unstructured":"Xu, Y., Jiang, Y., Zhao, X., Li, Y., Li, R.: Personalized repository recommendation service for developers with multi-modal features learning. In: 2023 IEEE International Conference on Web Services (ICWS), pp. 455\u2013464 (2023)","DOI":"10.1109\/ICWS60048.2023.00064"},{"key":"10_CR58","doi-asserted-by":"publisher","first-page":"2027","DOI":"10.1109\/TCE.2023.3337351","volume":"70","author":"Y Xu","year":"2024","unstructured":"Xu, Y., Qiu, Z., Gao, H., Zhao, X., Wang, L., Li, R.: Heterogeneous data-driven failure diagnosis for microservice-based industrial clouds toward consumer digital ecosystems. IEEE Trans. Consum. Electron. 70, 2027\u20132037 (2024)","journal-title":"IEEE Trans. Consum. Electron."},{"key":"10_CR59","doi-asserted-by":"crossref","unstructured":"Yang, Y., Li, Y., Ferm\u00fcller, C., Aloimonos, Y.: Robot learning manipulation action plans by \u201cwatching\u201d unconstrained videos from the world wide web. In: AAAI Conference on Artificial Intelligence (2015)","DOI":"10.1609\/aaai.v29i1.9671"},{"key":"10_CR60","unstructured":"Yu, T., et al.: Meta-world: a benchmark and evaluation for multi-task and meta reinforcement learning. ArXiv abs\/1910.10897 (2019)"},{"key":"10_CR61","unstructured":"Yu, W., et al.: Language to rewards for robotic skill synthesis. ArXiv abs\/2306.08647 (2023)"},{"key":"10_CR62","unstructured":"Zakka, K., Zeng, A., Florence, P.R., Tompson, J., Bohg, J., Dwibedi, D.: XIRL: cross-embodiment inverse reinforcement learning. In: Conference on Robot Learning (2021)"},{"key":"10_CR63","unstructured":"Ziebart, B.D., Maas, A.L., Bagnell, J.A., Dey, A.K.: Maximum entropy inverse reinforcement learning. 
In: AAAI Conference on Artificial Intelligence (2008)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72998-0_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T18:04:41Z","timestamp":1727633081000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72998-0_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031729973","9783031729980"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72998-0_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}