{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T00:40:01Z","timestamp":1767141601396,"version":"build-2238731810"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2019,10,11]],"date-time":"2019-10-11T00:00:00Z","timestamp":1570752000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,10,11]],"date-time":"2019-10-11T00:00:00Z","timestamp":1570752000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1007\/s11263-019-01238-5","type":"journal-article","created":{"date-parts":[[2019,10,30]],"date-time":"2019-10-30T14:38:30Z","timestamp":1572446310000},"page":"1360-1374","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Model-Based Robot Imitation with Future Image Similarity"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4360-9575","authenticated-orcid":false,"given":"A.","family":"Wu","sequence":"first","affiliation":[]},{"given":"A. J.","family":"Piergiovanni","sequence":"additional","affiliation":[]},{"given":"M. S.","family":"Ryoo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,10,11]]},"reference":[{"key":"1238_CR1","doi-asserted-by":"crossref","unstructured":"Abbeel, P., & Ng, A. Y. (2004) Apprenticeship learning via inverse reinforcement learning. In International conference on machine learning (ICML).","DOI":"10.1145\/1015330.1015430"},{"key":"1238_CR2","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1016\/j.robot.2008.10.024","volume":"31","author":"BD Argall","year":"2009","unstructured":"Argall, B. D., Chernova, S., Veloso, M., & Browning, B. (2009). A survey of robot learning from demonstration. Robotics and Autonomous Systems, 31, 469\u2013483.","journal-title":"Robotics and Autonomous Systems"},{"key":"1238_CR3","unstructured":"Babaeizadeh, M., Finn, C., Erhan, D., Campbell, R. H., & Levine, S. (2017). Stochastic variational video prediction. In CoRR.\u00a0\nhttp:\/\/arxiv.org\/abs\/1710.11252\n\n."},{"key":"1238_CR4","unstructured":"Baram, N., Anschel, O., Caspi, I., & Mannor, S. (2017). End-to-end differentiable adversarial imitation learning. In International conference on machine learning (ICML) (pp. 390\u2013399)."},{"key":"1238_CR5","unstructured":"Bojarski, M., Del Testa, D., Dworakowski, D., Firner, B., Flepp, B., Goyal, P., Jackel, L. D., Monfort, M., Muller, U., Zhang, J., et\u00a0al. (2016) End to end learning for self-driving cars. \narXiv:1604.07316\n\n."},{"key":"1238_CR6","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Zisserman, A. (2017). Quo vadis, action recognition? A new model and the kinetics dataset. In IEEE conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.502"},{"key":"1238_CR7","unstructured":"Chao, Y. W., Yang, J., Price, B., Cohen, S., & Deng, J. (2016). Forecasting human dynamics from static images. In: IEEE conference on computer vision and pattern recognition (CVPR)."},{"key":"1238_CR8","unstructured":"Chiappa, S., Racani\u00e8re, S., Wierstra, D., & Mohamed, S. (2017). Recurrent environment simulators. In CoRR. \nhttp:\/\/arxiv.org\/abs\/1704.02254\n\n."},{"key":"1238_CR9","unstructured":"Denton, E., & Fergus, R. (2018). Stochastic video generation with a learned prior. In CoRR. \narXiv:1802.07687\n\n."},{"issue":"4","key":"1238_CR10","first-page":"692","volume":"39","author":"A Dosovitskiy","year":"2017","unstructured":"Dosovitskiy, A., Springenberg, J. T., Tatarchenko, M., & Brox, T. (2017). Learning to generate chairs, tables and cars with convolutional networks. IEEE Transactions on Pattern Analysis and Machine Intelligence, 39(4), 692\u2013705.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1238_CR11","doi-asserted-by":"crossref","unstructured":"Finn, C., & Levine, S. (2017). Deep visual foresight for planning robot motion. In IEEE international conference on robotics and automation (ICRA). IEEE (pp. 2786\u20132793).","DOI":"10.1109\/ICRA.2017.7989324"},{"key":"1238_CR12","unstructured":"Finn, C., Goodfellow, I.\u00a0J., & Levine, S. (2016). Unsupervised learning for physical interaction through video prediction. In CoRR. \nhttp:\/\/arxiv.org\/abs\/1605.07157"},{"key":"1238_CR13","unstructured":"Finn, C., Levine, S., & Abbeel, P. (2016). Guided cost learning: Deep inverse optimal control via policy optimization. \narXiv:1603.00448\n\n."},{"issue":"2","key":"1238_CR14","doi-asserted-by":"publisher","first-page":"661","DOI":"10.1109\/LRA.2015.2509024","volume":"1","author":"A Giusti","year":"2016","unstructured":"Giusti, A., Guzzi, J., Cire\u015fan, D. C., He, F.-L., Rodr\u00edguez, J. P., Fontana, F., et al. (2016). A machine learning approach to visual perception of forest trails for mobile robots. IEEE Robotics and Automation Letters, 1(2), 661\u2013667.","journal-title":"IEEE Robotics and Automation Letters"},{"key":"1238_CR15","unstructured":"Ho, J., & Ermon, S. (2016). Generative adversarial imitation learning. In Advances in neural information processing systems (NIPS)."},{"key":"1238_CR16","unstructured":"Ho, J., Gupta, J., & Ermon, S. (2016). Model-free imitation learning with policy optimization. \narXiv:1605.08478\n\n."},{"key":"1238_CR17","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. \narXiv:1412.6980\n\n."},{"key":"1238_CR18","unstructured":"Laskey, M., Lee, J., Hsieh, W., Liaw, R., Mahler, J., Fox, R., & Goldberg, K. (2017). Iterative noise injection for scalable imitation learning. \narXiv:1703.09327\n\n."},{"key":"1238_CR19","unstructured":"Lee, J., & Ryoo, M. S. (2017). Learning robot activities from first-person human videos using convolutional future regression. In IEEE\/RSJ international conference on intelligent robots and systems (IROS)."},{"key":"1238_CR20","unstructured":"Levine, S., Pastor, P., Krizhevsky, A., & Quillen, D. (2016). Learning hand-eye coordination for robotic grasping with large-scale data collection. In International symposium on experimental robotics (pp. 173\u2013184). Springer."},{"key":"1238_CR21","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016). SSD: Single shot multibox detector. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"1238_CR22","unstructured":"Liu, Y., Gupta, A., Abbeel, P., & Levine, S. (2018). Imitation from observation: learning to imitate behaviors from raw video via context translation. \narXiv:1707.03374\n\n."},{"key":"1238_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., Yeh, R. A., Tang, X., Liu, Y., & Agarwala, A. (2017). Video frame synthesis using deep voxel flow. In IEEE international conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2017.478"},{"issue":"7540","key":"1238_CR24","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih, V., Kavukcuoglu, K., Silver, D., Rusu, A. A., Veness, J., Bellemare, M. G., et al. (2015). Human-level control through deep reinforcement learning. Nature, 518(7540), 529.","journal-title":"Nature"},{"key":"1238_CR25","unstructured":"Ng, A. Y., & Jordan, M. I. (2000). Inverse reinforcement learning. In International conference on machine learning (ICML)."},{"key":"1238_CR26","unstructured":"Oh, J., Guo, X., Lee, H., Lewis, R. L., & Singh, S. (2015). Action-conditional video prediction using deep networks in atari games. In CoRR. \narXiv:1507.08750\n\n."},{"key":"1238_CR27","unstructured":"Pathak, D., Mahmoudieh, P., Luo, G., Agrawal, P., Chen, D., Shentu, Y., Shelhamer, E., Malik, Y., Efros, A. A., & Darrell, T. (2018). Zero-shot visual imitation. \narXiv:1804.08606\n\n."},{"key":"1238_CR28","doi-asserted-by":"crossref","unstructured":"Peng, X. B., Abbeel, P., Levine, S., & van de Panne, M. (2018). Deepmimic: Example-guided deep reinforcement learning of physics-based character skills. In ACM SIGGRAPH.","DOI":"10.1145\/3197517.3201311"},{"key":"1238_CR29","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A. J., & Ryoo, M. S. (2018). Learning latent super-events to detect multiple activities in videos. In IEEE conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2018.00556"},{"key":"1238_CR30","unstructured":"Pomerleau, D. A. (1989). Alvinn: An autonomous land vehicle in a neural network. In Advances in neural information processing systems (NIPS) (pp. 305\u2013313)."},{"issue":"1","key":"1238_CR31","doi-asserted-by":"publisher","first-page":"88","DOI":"10.1162\/neco.1991.3.1.88","volume":"3","author":"DA Pomerleau","year":"1991","unstructured":"Pomerleau, D. A. (1991). Efficient training of artificial neural networks for autonomous navigation. Neural Computation, 3(1), 88\u201397.","journal-title":"Neural Computation"},{"key":"1238_CR32","unstructured":"Radford, A., Metz, L., & Chintala, S. (2015). Unsupervised representation learning with deep convolutional generative adversarial networks. \narXiv:1511.06434\n\n."},{"key":"1238_CR33","unstructured":"Ross, S., Gordon, G., & Bagnell, D. (2011). A reduction of imitation learning and structured prediction to no-regret online learning. In International conference on artificial intelligence and statistics (pp. 627\u2013635)."},{"key":"1238_CR34","unstructured":"Sadeghi, F., Toshev, A., Jang, E., & Levine, S. (2017). Sim2real view invariant visual servoing by recurrent control. \narXiv:1712.07642\n\n."},{"issue":"5","key":"1238_CR35","doi-asserted-by":"publisher","first-page":"561","DOI":"10.3233\/IDA-2007-11508","volume":"11","author":"S Salvador","year":"2004","unstructured":"Salvador, S., & Chan, P. (2004). Fastdtw: Toward accurate dynamic time warping in linear time and space. Intelligent Data Analysis, 11(5), 561\u2013580.","journal-title":"Intelligent Data Analysis"},{"key":"1238_CR36","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. \narXiv:1409.1556\n\n."},{"key":"1238_CR37","volume-title":"Reinforcement learning: An introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. Cambridge: MIT press."},{"key":"1238_CR38","doi-asserted-by":"crossref","unstructured":"Tatarchenko, M., Dosovitskiy, A., & Brox, T. (2016). Multi-view 3D models from single images with a convolutional network. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-46478-7_20"},{"key":"1238_CR39","unstructured":"Torabi, F., Warnell, G., & Stone, P. (2018). Behavioral cloning from observation. \narXiv:1805.01954\n\n."},{"issue":"4","key":"1238_CR40","doi-asserted-by":"publisher","first-page":"1039","DOI":"10.1109\/TSMCB.2012.2185694","volume":"42","author":"A Vakanski","year":"2012","unstructured":"Vakanski, A., Mantegh, I., Irish, A., & Janabi-Sharifi, F. (2012). Trajectory learning for robot programming by demonstration using hidden markov model and dynamic time warping. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 42(4), 1039\u20131052.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)"},{"key":"1238_CR41","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Pirsiavash, H., & Torralba, A. (2016). Anticipating visual representations from unlabeled video. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 98\u2013106).","DOI":"10.1109\/CVPR.2016.18"},{"key":"1238_CR42","doi-asserted-by":"crossref","unstructured":"Walker, J., Gupta, A., & Hebert, M. (2014). Patch to the future: Unsupervised visual prediction. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 3302\u20133309).","DOI":"10.1109\/CVPR.2014.416"},{"key":"1238_CR43","doi-asserted-by":"crossref","unstructured":"Walker, J., Marino, K., Gupta, A., & Hebert, M. (2017). The pose knows: Video forecasting by generating pose futures. In IEEE international conference on computer vision (ICCV) (pp. 3352\u20133361).","DOI":"10.1109\/ICCV.2017.361"},{"issue":"4","key":"1238_CR44","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image quality assessment: From error visibility to structural similarity. IEEE Transactions on Image Processing, 13(4), 600\u2013612.","journal-title":"IEEE Transactions on Image Processing"},{"key":"1238_CR45","unstructured":"Wulfmeier, M., Ondruska, P., & Posner, I. (2015). Deep inverse reinforcement learning. \narXiv:1507.04888\n\n."},{"key":"1238_CR46","doi-asserted-by":"crossref","unstructured":"Zhou, T., Tulsiani, S., Sun, W., Malik, J., & Efros, A. A. (2016). View synthesis by appearance flow. In European conference on computer vision (ECCV) (2016) (pp. 286\u2013301).","DOI":"10.1007\/978-3-319-46493-0_18"},{"key":"1238_CR47","unstructured":"Zhu, Y., Mottaghi, R., Kolve, E., Lim, J. J., Gupta, A., Fei-Fei, L., & Farhadi, A. (2016). Target-driven visual navigation in indoor scenes using deep reinforcement learning. \narXiv:1609.05143\n\n."},{"key":"1238_CR48","unstructured":"Ziebart, B. D., Maas, A., Bagnell, J. A., & Dey, A. K. (2008). Maximum entropy inverse reinforcement learning. In PAAAI conference on artificial intelligence (AAAI)."}],"updated-by":[{"DOI":"10.1007\/s11263-019-01272-3","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2019,12,9]],"date-time":"2019-12-09T00:00:00Z","timestamp":1575849600000}}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01238-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-019-01238-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01238-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,9]],"date-time":"2020-10-09T19:40:46Z","timestamp":1602272446000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-019-01238-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,11]]},"references-count":48,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2020,5]]}},"alternative-id":["1238"],"URL":"https:\/\/doi.org\/10.1007\/s11263-019-01238-5","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,10,11]]},"assertion":[{"value":"27 July 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 September 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 October 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 December 2019","order":4,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":5,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The acknowledgement section was omitted in the original version of this article, which is given below.","order":6,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}]}}