{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T20:56:11Z","timestamp":1776718571059,"version":"3.51.2"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012274","type":"print"},{"value":"9783030012281","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01228-1_17","type":"book-chapter","created":{"date-parts":[[2018,10,6]],"date-time":"2018-10-06T01:03:51Z","timestamp":1538787831000},"page":"276-293","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":94,"title":["MT-VAE: Learning Motion Transformations to Generate Multimodal Human Dynamics"],"prefix":"10.1007","author":[{"given":"Xinchen","family":"Yan","sequence":"first","affiliation":[]},{"given":"Akash","family":"Rastogi","sequence":"additional","affiliation":[]},{"given":"Ruben","family":"Villegas","sequence":"additional","affiliation":[]},{"given":"Kalyan","family":"Sunkavalli","sequence":"additional","affiliation":[]},{"given":"Eli","family":"Shechtman","sequence":"additional","affiliation":[]},{"given":"Sunil","family":"Hadap","sequence":"additional","affiliation":[]},{"given":"Ersin","family":"Yumer","sequence":"additional","affiliation":[]},{"given":"Honglak","family":"Lee","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"issue":"3","key":"17_CR1","doi-asserted-by":"publisher","first-page":"98:1","DOI":"10.1145\/1360612.1360697","volume":"27","author":"E de Aguiar","year":"2008","unstructured":"de Aguiar, E., Stoll, C., Theobalt, C., Ahmed, N., Seidel, H.P., Thrun, S.: Performance capture from sparse multi-view video. ACM Trans. Graph. 27(3), 98:1\u201398:10 (2008)","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"17_CR2","doi-asserted-by":"publisher","first-page":"75:1","DOI":"10.1145\/2010324.1964970","volume":"30","author":"T Beeler","year":"2011","unstructured":"Beeler, T., et al.: High-quality passive facial performance capture using anchor frames. ACM Trans. Graph. 30(4), 75:1\u201375:10 (2011)","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"17_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2010324.1964955","volume":"30","author":"Fei Yang","year":"2011","unstructured":"Yang, F., Wang, J., Shechtman, E., Bourdev, L., Metaxas, D.: Expression flow for 3D-aware face component transfer. ACM Trans. Graph. (TOG), 30, 60 (2011)","journal-title":"ACM Transactions on Graphics"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: What makes tom hanks look like tom hanks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3952\u20133960 (2015)","DOI":"10.1109\/ICCV.2015.450"},{"issue":"4","key":"17_CR5","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing obama: learning lip sync from audio. ACM Trans. Graph. (TOG) 36(4), 95 (2017)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Sermanet, P., Lynch, C., Hsu, J., Levine, S.: Time-contrastive networks: self-supervised learning from multi-view observation. arXiv preprint arXiv:1704.06888 (2017)","DOI":"10.1109\/CVPRW.2017.69"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Rose, C., Guenter, B., Bodenheimer, B., Cohen, M.F.: Efficient generation of motion transitions using spacetime constraints. In: SIGGRAPH (1996)","DOI":"10.1145\/237170.237229"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Bregler, C.: Learning and recognizing human dynamics in video sequences. In: 1997 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, Proceedings, pp. 568\u2013574. IEEE (1997)","DOI":"10.1109\/CVPR.1997.609382"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Efros, A.A., Berg, A.C., Mori, G., Malik, J.: Recognizing action at a distance. In: Null, p. 726. IEEE (2003)","DOI":"10.1109\/ICCV.2003.1238420"},{"issue":"12","key":"17_CR10","doi-asserted-by":"publisher","first-page":"2247","DOI":"10.1109\/TPAMI.2007.70711","volume":"29","author":"L Gorelick","year":"2007","unstructured":"Gorelick, L., Blank, M., Shechtman, E., Irani, M., Basri, R.: Actions as space-time shapes. IEEE Trans. Pattern Anal. Mach. Intell. 29(12), 2247\u20132253 (2007)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2\u20133","key":"17_CR11","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev, I.: On space-time interest points. Int. J. Comput. Vis. 64(2\u20133), 107\u2013123 (2005)","journal-title":"Int. J. Comput. Vis."},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C.L.: Action recognition by dense trajectories. In: 2011 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3169\u20133176. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Wang, J., Liu, Z., Wu, Y., Yuan, J.: Mining actionlet ensemble for action recognition with depth cameras. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1290\u20131297. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247813"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Walker, J., Gupta, A., Hebert, M.: Dense optical flow prediction from a static image. In: 2015 IEEE International Conference on Computer Vision (ICCV), pp. 2443\u20132451. IEEE (2015)","DOI":"10.1109\/ICCV.2015.281"},{"key":"17_CR15","unstructured":"Fischer, P., et al.: Flownet: Learning optical flow with convolutional networks. arXiv preprint arXiv:1504.06852 (2015)"},{"key":"17_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"835","DOI":"10.1007\/978-3-319-46478-7_51","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Walker","year":"2016","unstructured":"Walker, J., Doersch, C., Gupta, A., Hebert, M.: An uncertain future: forecasting from static images using variational autoencoders. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 835\u2013851. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_51"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Yang, J., Price, B., Cohen, S., Deng, J.: Forecasting human dynamics from static images. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.388"},{"key":"17_CR18","unstructured":"Villegas, R., Yang, J., Zou, Y., Sohn, S., Lin, X., Lee, H.: Learning to generate long-term future via hierarchical prediction. In: ICML (2017)"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Walker, J., Marino, K., Gupta, A., Hebert, M.: The pose knows: Video forecasting by generating pose futures. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 3352\u20133361. IEEE (2017)","DOI":"10.1109\/ICCV.2017.361"},{"key":"17_CR20","unstructured":"Li, Z., Zhou, Y., Xiao, S., He, C., Huang, Z., Li, H.: Auto-conditioned recurrent networks for extended complex human motion synthesis. In: ICLR (2018)"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Yang, F., Bourdev, L., Shechtman, E., Wang, J., Metaxas, D.: Facial expression editing in video using a temporally-smooth factorization. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 861\u2013868. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247759"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Thies, J., Zollhofer, M., Stamminger, M., Theobalt, C., Nie\u00dfner, M.: Face2face: real-time face capture and reenactment of RGB videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2387\u20132395 (2016)","DOI":"10.1109\/CVPR.2016.262"},{"issue":"6","key":"17_CR23","doi-asserted-by":"publisher","first-page":"196","DOI":"10.1145\/3130800.3130818","volume":"36","author":"H Averbuch-Elor","year":"2017","unstructured":"Averbuch-Elor, H., Cohen-Or, D., Kopf, J., Cohen, M.F.: Bringing portraits to life. ACM Trans. Graph. 36(6), 196 (2017). (Proceeding of SIGGRAPH Asia 2017)","journal-title":"ACM Trans. Graph."},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Blanz, V., Vetter, T.: A morphable model for the synthesis of 3D faces. In: Proceedings of the 26th Annual Conference on Computer Graphics and Interactive Techniques, pp. 187\u2013194. ACM Press\/Addison-Wesley Publishing Co. (1999)","DOI":"10.1145\/311535.311556"},{"key":"17_CR25","unstructured":"Srivastava, N., Mansimov, E., Salakhudinov, R.: Unsupervised learning of video representations using lstms. In: International Conference on Machine Learning, pp. 843\u2013852 (2015)"},{"key":"17_CR26","unstructured":"Mathieu, M., Couprie, C., LeCun, Y.: Deep multi-scale video prediction beyond mean square error. In: ICLR (2016)"},{"key":"17_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1007\/978-3-642-21735-7_6","volume-title":"Artificial Neural Networks and Machine Learning \u2013 ICANN 2011","author":"GE Hinton","year":"2011","unstructured":"Hinton, G.E., Krizhevsky, A., Wang, S.D.: Transforming auto-encoders. In: Honkela, T., Duch, W., Girolami, M., Kaski, S. (eds.) ICANN 2011. LNCS, vol. 6791, pp. 44\u201351. Springer, Heidelberg (2011). https:\/\/doi.org\/10.1007\/978-3-642-21735-7_6"},{"key":"17_CR28","unstructured":"Oh, J., Guo, X., Lee, H., Lewis, R.L., Singh, S.: Action-conditional video prediction using deep networks in atari games. In: NIPS (2015)"},{"key":"17_CR29","unstructured":"Finn, C., Goodfellow, I.J., Levine, S.: Unsupervised learning for physical interaction through video prediction. In: NIPS (2016)"},{"key":"17_CR30","unstructured":"Yang, J., Reed, S.E., Yang, M.H., Lee, H.: Weakly-supervised disentangling with recurrent transformations for 3D view synthesis. In: Advances in Neural Information Processing Systems, pp. 1099\u20131107 (2015)"},{"key":"17_CR31","unstructured":"Villegas, R., Yang, J., Hong, S., Lin, X., Lee, H.: Decomposing motion and content for natural video sequence prediction. ICLR 1(2), 7 (2017)"},{"key":"17_CR32","unstructured":"Denton, E.L., Birodkar, V.: Unsupervised learning of disentangled representations from video. In: Advances in Neural Information Processing Systems, pp. 4417\u20134426 (2017)"},{"key":"17_CR33","unstructured":"Xue, T., Wu, J., Bouman, K., Freeman, B.: Visual dynamics: probabilistic future frame synthesis via cross convolutional networks. In: NIPS, pp. 91\u201399 (2016)"},{"key":"17_CR34","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: NIPS, pp. 613\u2013621 (2016)"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J.: Mocogan: decomposing motion and content for video generation. arXiv preprint arXiv:1707.04993 (2017)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"17_CR36","unstructured":"Wichers, N., Villegas, R., Erhan, D., Lee, H.: Hierarchical long-term video prediction without supervision. In: ICML"},{"key":"17_CR37","unstructured":"Kalchbrenner, N., et al.: Video pixel networks. arXiv preprint arXiv:1610.00527 (2016)"},{"key":"17_CR38","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems, pp. 3111\u20133119 (2013)"},{"key":"17_CR39","unstructured":"Kulkarni, T.D., Whitney, W.F., Kohli, P., Tenenbaum, J.: Deep convolutional inverse graphics network. In: Advances in Neural Information Processing Systems, pp. 2539\u20132547 (2015)"},{"key":"17_CR40","unstructured":"Reed, S.E., Zhang, Y., Zhang, Y., Lee, H.: Deep visual analogy-making. In: Advances in Neural Information Processing Systems, pp. 1252\u20131260 (2015)"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"Wang, X., Farhadi, A., Gupta, A.: Actions\u00a0transformations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2658\u20132667 (2016)","DOI":"10.1109\/CVPR.2016.291"},{"key":"17_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"262","DOI":"10.1007\/978-3-319-46484-8_16","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Y Zhou","year":"2016","unstructured":"Zhou, Y., Berg, T.L.: Learning temporal transformations from time-lapse videos. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 262\u2013277. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_16"},{"key":"17_CR43","unstructured":"Sohn, K., Yan, X., Lee, H.: Learning structured output representation using deep conditional generative models. In: Advances in Neural Information Processing Systems, pp. 3483\u20133491 (2015)"},{"key":"17_CR44","unstructured":"Zhu, J.Y., et al.: Toward multimodal image-to-image translation. In: Advances in Neural Information Processing Systems, pp. 465\u2013476 (2017)"},{"key":"17_CR45","unstructured":"Ha, D., Eck, D.: A neural representation of sketch drawings. In: ICLR (2018)"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Bowman, S.R., Vilnis, L., Vinyals, O., Dai, A.M., Jozefowicz, R., Bengio, S.: Generating sentences from a continuous space. arXiv preprint arXiv:1511.06349 (2015)","DOI":"10.18653\/v1\/K16-1002"},{"key":"17_CR47","unstructured":"Hu, Z., Yang, Z., Liang, X., Salakhutdinov, R., Xing, E.P.: Controllable text generation. arXiv preprint arXiv:1703.00955 (2017)"},{"key":"17_CR48","unstructured":"Babaeizadeh, M., Finn, C., Erhan, D., Campbell, R.H., Levine, S.: Stochastic variational video prediction. In: ICLR (2018)"},{"key":"17_CR49","unstructured":"Denton, E., Fergus, R.: Stochastic video generation with a learned prior. In: ICML (2018)"},{"issue":"8","key":"17_CR50","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"17_CR51","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"17_CR52","unstructured":"Gregor, K., Danihelka, I., Graves, A., Rezende, D., Wierstra, D.: Draw: a recurrent neural network for image generation. In: International Conference on Machine Learning, pp. 1462\u20131471 (2015)"},{"key":"17_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1007\/978-3-319-46493-0_47","volume-title":"Computer Vision \u2013 ECCV 2016","author":"X Yan","year":"2016","unstructured":"Yan, X., Yang, J., Sohn, K., Lee, H.: Attribute2Image: conditional image generation from visual attributes. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 776\u2013791. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_47"},{"issue":"1","key":"17_CR54","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1111\/tops.12009","volume":"5","author":"KA Smith","year":"2013","unstructured":"Smith, K.A., Vul, E.: Sources of uncertainty in intuitive physics. Top. Cogn. Sci. 5(1), 185\u2013199 (2013)","journal-title":"Top. Cogn. Sci."},{"key":"17_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"689","DOI":"10.1007\/978-3-319-10578-9_45","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T Lan","year":"2014","unstructured":"Lan, T., Chen, T.-C., Savarese, S.: A hierarchical representation for future action prediction. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8691, pp. 689\u2013704. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10578-9_45"},{"key":"17_CR56","unstructured":"Zafeiriou, S., Kollias, D., Nicolaou, M.A., Papaioannou, A., Zhao, G., Kotsia, I.: Aff-wild: valence and arousal in-the-wild challenge"},{"issue":"7","key":"17_CR57","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2014","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3. 6m: large scale datasets and predictive methods for 3D human sensing in natural environments. IEEE Trans. Pattern Anal. Mach. Intell. 36(7), 1325\u20131339 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"17_CR58","doi-asserted-by":"crossref","unstructured":"Paysan, P., Knothe, R., Amberg, B., Romdhani, S., Vetter, T.: A 3D face model for pose and illumination invariant face recognition. Genova, Italy. IEEE (2009)","DOI":"10.1109\/AVSS.2009.58"},{"key":"17_CR59","doi-asserted-by":"crossref","unstructured":"Tran, A.T., Hassner, T., Masi, I., Medioni, G.: Regressing robust and discriminative 3D morphable models with a very deep neural network. In: Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.163"},{"key":"17_CR60","doi-asserted-by":"crossref","unstructured":"Zhu, X., Lei, Z., Liu, X., Shi, H., Li, S.Z.: Face alignment across large poses: A 3D solution. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 146\u2013155 (2016)","DOI":"10.1109\/CVPR.2016.23"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01228-1_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:43:42Z","timestamp":1775241822000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01228-1_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012274","9783030012281"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01228-1_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}