{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T05:54:15Z","timestamp":1771480455421,"version":"3.50.1"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"10-11","license":[{"start":{"date-parts":[[2020,5,28]],"date-time":"2020-05-28T00:00:00Z","timestamp":1590624000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,5,28]],"date-time":"2020-05-28T00:00:00Z","timestamp":1590624000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,11]]},"DOI":"10.1007\/s11263-020-01334-x","type":"journal-article","created":{"date-parts":[[2020,5,28]],"date-time":"2020-05-28T06:02:26Z","timestamp":1590645746000},"page":"2552-2569","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["High-Quality Video Generation from Static Structural Annotations"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8525-9163","authenticated-orcid":false,"given":"Lu","family":"Sheng","sequence":"first","affiliation":[]},{"given":"Junting","family":"Pan","sequence":"additional","affiliation":[]},{"given":"Jiaming","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Chen Change","family":"Loy","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,5,28]]},"reference":[{"issue":"6","key":"1334_CR1","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1109\/MSP.2017.2743240","volume":"34","author":"K Arulkumaran","year":"2017","unstructured":"Arulkumaran, K., Deisenroth, M. 
P., Brundage, M., & Bharath, A. A. (2017). Deep reinforcement learning: a brief survey. IEEE Signal Processing Magazine, 34(6), 26\u201338.","journal-title":"IEEE Signal Processing Magazine"},{"key":"1334_CR2","unstructured":"Babaeizadeh, M., Finn, C., Erhan, D., Campbell, R.H., Levine, S. (2017). Stochastic variational video prediction. ICLR"},{"key":"1334_CR3","doi-asserted-by":"crossref","unstructured":"Balakrishnan, G., Zhao, A., Dalca, A.V., Durand, F., Guttag, J. (2018). Synthesizing images of humans in unseen poses. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2018.00870"},{"key":"1334_CR4","doi-asserted-by":"crossref","unstructured":"Bousmalis, K., Silberman, N., Dohan, D., Erhan, D., Krishnan, D. (2017). Unsupervised pixel-level domain adaptation with generative adversarial networks. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2017.18"},{"key":"1334_CR5","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A. (2017). Quo vadis, action recognition? a new model and the kinetics dataset. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2017.502"},{"key":"1334_CR6","doi-asserted-by":"crossref","unstructured":"Chen, B., Wang, W., Wang, J. (2017). Video imagination from a single image with transformation generation. In: ACM MM, ACM, pp 358\u2013366.","DOI":"10.1145\/3126686.3126737"},{"key":"1334_CR7","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H. (2018). Encoder-decoder with atrous separable convolution for semantic image segmentation. In: ECCV.","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"1334_CR8","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., Schiele, B. (2016). The cityscapes dataset for semantic urban scene understanding. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2016.350"},{"key":"1334_CR9","unstructured":"Denton, E., Fergus, R. (2018). Stochastic video generation with a learned prior. 
ICML"},{"key":"1334_CR10","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, A., Fischer, P., Ilg, E., Hausser, P., Hazirbas, C., Golkov, V., van\u00a0der Smagt, P., Cremers, D., Brox, T. (2015). Flownet: Learning optical flow with convolutional networks. In: ICCV, IEEE, pp 2758\u20132766.","DOI":"10.1109\/ICCV.2015.316"},{"key":"1334_CR11","unstructured":"Finn, C., Goodfellow, I., Levine, S. (2016). Unsupervised learning for physical interaction through video prediction. In: NIPS, pp 64\u201372."},{"key":"1334_CR12","doi-asserted-by":"crossref","unstructured":"Ganin, Y., Kononenko, D., Sungatullina, D., Lempitsky, V. (2016). Deepwarp: Photorealistic image resynthesis for gaze manipulation. In: ECCV, Springer, pp 311\u2013326.","DOI":"10.1007\/978-3-319-46475-6_20"},{"key":"1334_CR13","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R. (2013). Vision meets robotics: The kitti dataset. IJRR.","DOI":"10.1177\/0278364913491297"},{"key":"1334_CR14","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., Bengio, Y. (2014). Generative adversarial nets. In: NIPS, pp 2672\u20132680."},{"key":"1334_CR15","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S. (2017) GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: NIPS, pp 6626\u20136637"},{"key":"1334_CR16","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A. (2017). Image-to-image translation with conditional adversarial networks. CVPR.","DOI":"10.1109\/CVPR.2017.632"},{"key":"1334_CR17","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., et\u00a0al. (2015). Spatial transformer networks. In: NIPS, pp 2017\u20132025."},{"key":"1334_CR18","doi-asserted-by":"crossref","unstructured":"Jiang, H., Sun, D., Jampani, V., Yang, M.H., Learned-Miller, E., Kautz, J. (2018). 
Super SloMo: High quality estimation of multiple intermediate frames for video interpolation. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2018.00938"},{"key":"1334_CR19","doi-asserted-by":"crossref","unstructured":"Johnson, J., Alahi, A., Fei-Fei, L. (2016). Perceptual losses for real-time style transfer and super-resolution. In: ECCV, Springer, pp 694\u2013711.","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"1334_CR20","doi-asserted-by":"crossref","unstructured":"Johnson, J., Gupta, A., Fei-Fei, L. (2018). Image generation from scene graphs. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2018.00133"},{"key":"1334_CR21","unstructured":"Kalchbrenner, N., Oord, Avd., Simonyan, K., Danihelka, I., Vinyals, O., Graves, A., Kavukcuoglu, K. (2016). Video pixel networks. arXiv preprint arXiv:1610.00527."},{"key":"1334_CR22","unstructured":"Kingma, D.P., Welling, M. (2013). Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114."},{"key":"1334_CR23","unstructured":"Laptev, I., Caputo, B., et\u00a0al. (2004) Recognizing human actions: a local svm approach. In: ICPR, IEEE, pp 32\u201336."},{"key":"1334_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., Fang, C., Yang, J., Wang, Z., Lu, X., Yang, M.H. (2018). Flow-grounded spatial-temporal video prediction from still images. In: ECCV, Springer.","DOI":"10.1007\/978-3-030-01240-3_37"},{"key":"1334_CR25","doi-asserted-by":"crossref","unstructured":"Liang, X., Lee, L., Dai, W., Xing, E.P. (2017). Dual motion GAN for future-flow embedded video prediction. In: ICCV, IEEE.","DOI":"10.1109\/ICCV.2017.194"},{"key":"1334_CR26","doi-asserted-by":"crossref","unstructured":"Liu, G., Reda, F.A., Shih, K.J., Wang, T.C., Tao, A., Catanzaro, B. (2018). Image inpainting for irregular holes using partial convolutions. In: ECCV, Springer.","DOI":"10.1007\/978-3-030-01252-6_6"},{"key":"1334_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., Yeh, R., Tang, X., Liu, Y., Agarwala, A. (2017). Video frame synthesis using deep voxel flow. 
In: ICCV, IEEE.","DOI":"10.1109\/ICCV.2017.478"},{"key":"1334_CR28","doi-asserted-by":"crossref","unstructured":"Luo, Z., Peng, B., Huang, D.A., Alahi, A., Fei-Fei, L. (2017). Unsupervised learning of long-term motion dynamics for videos. arXiv preprint arXiv:1701.01821.","DOI":"10.1109\/CVPR.2017.751"},{"key":"1334_CR29","unstructured":"Ma, L., Jia, X., Sun, Q., Schiele, B., Tuytelaars, T., Van\u00a0Gool, L. (2017). Pose guided person image generation. In: NIPS, pp 406\u2013416."},{"key":"1334_CR30","unstructured":"Mathieu, M., Couprie, C., LeCun, Y. (2015). Deep multi-scale video prediction beyond mean square error. arXiv preprint arXiv:1511.05440."},{"key":"1334_CR31","unstructured":"Meister, S., Hur, J., Roth, S. (2018). UnFlow: Unsupervised learning of optical flow with a bidirectional census loss. In: AAAI, New Orleans, Louisiana."},{"key":"1334_CR32","unstructured":"Oord, Avd, Kalchbrenner, N., Kavukcuoglu, K. (2016). Pixel recurrent neural networks. ICML."},{"key":"1334_CR33","doi-asserted-by":"crossref","unstructured":"Pan, J., Wang, C., Jia, X., Shao, J., Sheng, L., Yan, J., Wang, X. (2019). Video generation from single semantic label map. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2019.00385"},{"key":"1334_CR34","unstructured":"Patraucean, V., Handa, A., Cipolla, R. (2015). Spatio-temporal video autoencoder with differentiable memory. arXiv preprint arXiv:1511.06309."},{"key":"1334_CR35","doi-asserted-by":"crossref","unstructured":"Pintea, S.L., van Gemert, J.C., Smeulders, A.W.M. (2014). Dejavu: Motion prediction in static images. In: ECCV, Springer.","DOI":"10.1007\/978-3-319-10578-9_12"},{"key":"1334_CR36","unstructured":"Radford, A., Metz, L., Chintala, S. (2015). Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434."},{"key":"1334_CR37","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., Saito, S. (2017). 
Temporal generative adversarial nets with singular value clipping. In: ICCV, IEEE.","DOI":"10.1109\/ICCV.2017.308"},{"key":"1334_CR38","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Pfister, T., Tuzel, O., Susskind, J., Wang, W., Webb, R. (2017). Learning from simulated and unsupervised images through adversarial training. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2017.241"},{"key":"1334_CR39","unstructured":"Sohn, K., Lee, H., Yan, X. (2015). Learning structured output representation using deep conditional generative models. In: NIPS, pp 3483\u20133491."},{"key":"1334_CR40","unstructured":"Soomro, K., Zamir, A.R., Shah, M. (2012). UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"1334_CR41","unstructured":"Srivastava N, Mansimov E, Salakhudinov R (2015) Unsupervised learning of video representations using LSTMs. In: ICML, pp 843\u2013852"},{"key":"1334_CR42","doi-asserted-by":"crossref","unstructured":"Sun D, Yang X, Liu MY, Kautz J (2018) PWC-Net: CNNs for optical flow using pyramid, warping, and cost volume. In: CVPR","DOI":"10.1109\/CVPR.2018.00931"},{"key":"1334_CR43","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J. (2018). MoCoGAN: Decomposing motion and content for video generation. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2018.00165"},{"issue":"1","key":"1334_CR44","first-page":"7184","volume":"17","author":"B Uria","year":"2016","unstructured":"Uria, B., C\u00f4t\u00e9, M. A., Gregor, K., Murray, I., & Larochelle, H. (2016). Neural Autoregressive Distribution Estimation. JMLR, 17(1), 7184\u20137220.","journal-title":"JMLR"},{"key":"1334_CR45","unstructured":"Villegas, R., Yang, J., Hong, S., Lin, X., Lee, H. (2017a). Decomposing motion and content for natural video sequence prediction. 
arXiv preprint arXiv:1706.08033"},{"key":"1334_CR46","unstructured":"Villegas, R., Yang, J., Zou, Y., Sohn, S., Lin, X., Lee, H. (2017b). Learning to generate long-term future via hierarchical prediction. In: ICML."},{"key":"1334_CR47","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Torralba, A. (2017). Generating the future with adversarial transformers. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2017.319"},{"key":"1334_CR48","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A. (2016a). Anticipating visual representations from unlabeled video. In: CVPR, IEEE, pp 98\u2013106.","DOI":"10.1109\/CVPR.2016.18"},{"key":"1334_CR49","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A. (2016b). Generating videos with scene dynamics. In: NIPS, pp 613\u2013621."},{"key":"1334_CR50","doi-asserted-by":"crossref","unstructured":"Walker, J., Doersch, C., Gupta, A., Hebert, M. (2016). An uncertain future: Forecasting from static images using variational autoencoders. In: ECCV, Springer, pp 835\u2013851","DOI":"10.1007\/978-3-319-46478-7_51"},{"key":"1334_CR51","doi-asserted-by":"crossref","unstructured":"Walker, J., Gupta, A., Hebert, M. (2014). Patch to the future: Unsupervised visual prediction. In: CVPR, IEEE, pp 3302\u20133309.","DOI":"10.1109\/CVPR.2014.416"},{"key":"1334_CR52","doi-asserted-by":"crossref","unstructured":"Walker, J., Gupta, A., Hebert, M. (2015). Dense optical flow prediction from a static image. In: ICCV, IEEE, pp 2443\u20132451.","DOI":"10.1109\/ICCV.2015.281"},{"key":"1334_CR53","unstructured":"Wang, T.C., Liu, M.Y., Zhu, J.Y., Liu, G., Tao, A., Kautz, J., Catanzaro, B. (2018a). Video-to-video synthesis. In: NeurIPS."},{"key":"1334_CR54","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Liu, M.Y., Zhu, J.Y., Tao, A., Kautz, J., Catanzaro, B. (2018b). High-resolution image synthesis and semantic manipulation with conditional GANs. 
In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2018.00917"},{"issue":"4","key":"1334_CR55","first-page":"600","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image quality assessment: from error visibility to structural similarity. TIP, 13(4), 600\u2013612.","journal-title":"TIP"},{"key":"1334_CR56","unstructured":"Xue, T., Chen, B., Wu, J., Wei, D., Freeman, W.T. (2017). Video enhancement with task-oriented flow. arXiv preprint arXiv:1711.09078."},{"key":"1334_CR57","unstructured":"Xue, T., Wu, J., Bouman, K., Freeman, B. (2016). Visual dynamics: Probabilistic future frame synthesis via cross convolutional networks. In: NIPS, pp 91\u201399."},{"key":"1334_CR58","doi-asserted-by":"crossref","unstructured":"Yin, Z., Shi, J. (2018). Geonet: Unsupervised learning of dense depth, optical flow and camera pose. In: CVPR, IEEE.","DOI":"10.1109\/CVPR.2018.00212"},{"key":"1334_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, T., Li, H., Zhang, S., Wang, X., Huang, X., Metaxas, D.N. (2017). StackGAN: Text to photo-realistic image synthesis with stacked generative adversarial networks. In: ICCV, pp 5907\u20135915.","DOI":"10.1109\/ICCV.2017.629"},{"key":"1334_CR60","doi-asserted-by":"crossref","unstructured":"Zhao, L., Peng, X., Tian, Y., Kapadia, M., Metaxas, D. (2018). Learning to forecast and refine residual motion for image-to-video generation. In: ECCV, Springer.","DOI":"10.1007\/978-3-030-01267-0_24"},{"key":"1334_CR61","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Zheng, L., Yang, Y. (2017). Unlabeled samples generated by gan improve the person re-identification baseline in vitro. In: ICCV, pp 3754\u20133762.","DOI":"10.1109\/ICCV.2017.405"},{"key":"1334_CR62","doi-asserted-by":"crossref","unstructured":"Zhou, T., Tulsiani, S., Sun, W., Malik, J., Efros, A.A. (2016). View synthesis by appearance flow. 
In: ECCV, Springer, pp 286\u2013301.","DOI":"10.1007\/978-3-319-46493-0_18"},{"key":"1334_CR63","doi-asserted-by":"crossref","unstructured":"Zhu, J.Y., Park, T., Isola, P., Efros, A.A. (2017). Unpaired image-to-image translation using cycle-consistent adversarial networks. In: ICCV, IEEE.","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-020-01334-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-020-01334-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-020-01334-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,5,27]],"date-time":"2021-05-27T23:43:31Z","timestamp":1622159011000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-020-01334-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5,28]]},"references-count":63,"journal-issue":{"issue":"10-11","published-print":{"date-parts":[[2020,11]]}},"alternative-id":["1334"],"URL":"https:\/\/doi.org\/10.1007\/s11263-020-01334-x","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,5,28]]},"assertion":[{"value":"15 May 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 April 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 May 2020","order":3,"name":"first_online","label":"First 
Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}