{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:28:36Z","timestamp":1775579316812,"version":"3.50.1"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"10-11","license":[{"start":{"date-parts":[[2020,5,29]],"date-time":"2020-05-29T00:00:00Z","timestamp":1590710400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,5,29]],"date-time":"2020-05-29T00:00:00Z","timestamp":1590710400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,11]]},"DOI":"10.1007\/s11263-020-01333-y","type":"journal-article","created":{"date-parts":[[2020,5,29]],"date-time":"2020-05-29T06:02:31Z","timestamp":1590732151000},"page":"2586-2606","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":74,"title":["Train Sparsely, Generate Densely: Memory-Efficient Unsupervised Training of High-Resolution Temporal GAN"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4200-6585","authenticated-orcid":false,"given":"Masaki","family":"Saito","sequence":"first","affiliation":[]},{"given":"Shunta","family":"Saito","sequence":"additional","affiliation":[]},{"given":"Masanori","family":"Koyama","sequence":"additional","affiliation":[]},{"given":"Sosuke","family":"Kobayashi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,5,29]]},"reference":[{"key":"1333_CR1","unstructured":"Acharya, D., Huang, Z., Paudel, D. P., & Gool, L. V. (2018). Towards high resolution video generation with progressive growing of sliced Wasserstein GANs. Arxiv preprint arXiv:1810.02419"},{"key":"1333_CR2","unstructured":"Akiba, T., Fukuda, K., & Suzuki, S. (2017). ChainerMN: Scalable distributed deep learning framework. In Proceedings of workshop on ML systems in NIPS"},{"key":"1333_CR3","unstructured":"Babaeizadeh, M., Finn, C., Erhan, D., Campbell, R. H. & Levine, S. (2018). Stochastic variational video prediction. In ICLR."},{"key":"1333_CR4","doi-asserted-by":"crossref","unstructured":"Bansal, A., Ma, S., Ramanan, D., & Sheikh, Y. (2018). Recycle-GAN: Unsupervised video retargeting. In ECCV.","DOI":"10.1007\/978-3-030-01228-1_8"},{"key":"1333_CR5","doi-asserted-by":"crossref","unstructured":"Borji, A. (2018). Pros and cons of GAN evaluation measures. Arxiv preprint arXiv:1802.03446.","DOI":"10.1016\/j.cviu.2018.10.009"},{"key":"1333_CR6","unstructured":"Brock, A., Donahue, J., & Simonyan, K. (2018). Large scale GAN training for high fidelity natural image synthesis. Arxiv preprint arXiv:1809.11096."},{"key":"1333_CR7","doi-asserted-by":"crossref","unstructured":"Byeon, W., Wang, Q., Srivastava, R. K., & Koumoutsakos, P. (2018). ContextVP: Fully context-aware video prediction. In ECCV.","DOI":"10.1007\/978-3-030-01270-0_46"},{"key":"1333_CR8","doi-asserted-by":"crossref","unstructured":"Cai, H., Bai, C., Tai, Y. W., & Tang, C. K. (2018). Deep video generation. Prediction and completion of human action sequences. In ECCV.","DOI":"10.1007\/978-3-030-01216-8_23"},{"key":"1333_CR9","unstructured":"Denton, E., Chintala, S., Szlam, A., & Fergus, R. (2015). Deep generative image models using a Laplacian pyramid of adversarial networks. In NIPS."},{"key":"1333_CR10","unstructured":"Denton, E., & Fergus, R. (2018). Stochastic video generation with a learned prior. Arxiv preprint arXiv:1802.07687."},{"key":"1333_CR11","unstructured":"Ebert, F., Finn, C., Lee, A.X. & Levine, S. (2017). Self-supervised visual planning with temporal skip connections. In Conference on robot learning (CoRL)."},{"key":"1333_CR12","unstructured":"Finn, C., Goodfellow, I., & Levine, S. (2016). Unsupervised learning for physical interaction through video prediction. In NIPS."},{"key":"1333_CR13","unstructured":"Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks. In AISTATS."},{"key":"1333_CR14","unstructured":"Goodfellow, I. J., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., et al. (2014). Generative adversarial nets. In NIPS."},{"key":"1333_CR15","unstructured":"Gulrajani, I., Ahmed, F., Arjovsky, M., Dumoulin, V., & Courville, A. C. (2017). Improved training of Wasserstein GANs. In NIPS."},{"key":"1333_CR16","doi-asserted-by":"crossref","unstructured":"Hao, Z., Huang, X., & Belongie, S. (2018). Controllable video generation with sparse trajectories. In CVPR.","DOI":"10.1109\/CVPR.2018.00819"},{"key":"1333_CR17","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., & Satoh, Y. (2018). Can spatiotemporal 3D CNNs retrace the history of 2D CNNs and ImageNet? In CVPR.","DOI":"10.1109\/CVPR.2018.00685"},{"key":"1333_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1333_CR19","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017). GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In NIPS."},{"key":"1333_CR20","doi-asserted-by":"crossref","unstructured":"Huang, X., Liu, M. Y., Belongie, S., & Kautz, J. (2018). Multimodal unsupervised image-to-image translation. In ECCV.","DOI":"10.1007\/978-3-030-01219-9_11"},{"key":"1333_CR21","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J. Y., Zhou, T., & Efros, A. A. (2017). Image-to-image translation with conditional adversarial networks. In CVPR.","DOI":"10.1109\/CVPR.2017.632"},{"key":"1333_CR22","unstructured":"Kalchbrenner, N., van\u00a0den Oord, A., Simonyan, K., Danihelka, I., Vinyals, O., Graves, A., et al. (2016). Video pixel networks. arXiv preprint arXiv:1610.00527."},{"key":"1333_CR23","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Shetty, S., Toderici, G., Sukthankar, R., Leung, T., & Li Fei-Fei. (2014). Large-scale video classification with convolutional neural networks. In CVPR.","DOI":"10.1109\/CVPR.2014.223"},{"key":"1333_CR24","unstructured":"Karras, T., Aila, T., Laine, S., & Lehtinen, J. (2018). Progressive growing of GANs for improved quality, stability, and variation. In ICLR."},{"key":"1333_CR25","unstructured":"Kingma, D., & Ba, J. (2015). Adam: A method for stochastic optimization. In ICLR."},{"key":"1333_CR26","unstructured":"Lee, A. X., Zhang, R., Ebert, F., Abbeel, P., Finn, C., & Levine, S. (2018). Stochastic adversarial video prediction. arXiv preprint arXiv:1804.01523."},{"key":"1333_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., Fang, C., Yang, J., Wang, Z., Lu, X., & Yang, M. H. (2018). Flow-grounded spatial-temporal video prediction from still images. In ECCV.","DOI":"10.1007\/978-3-030-01240-3_37"},{"key":"1333_CR28","doi-asserted-by":"crossref","unstructured":"Liang, X., Lee, L., Dai, W., & Xing, E. P. (2017). Dual motion GAN for future-flow embedded video prediction. In ICCV.","DOI":"10.1109\/ICCV.2017.194"},{"key":"1333_CR29","unstructured":"Liu, M. Y., Breuel, T., & Kautz, J. (2017). Unsupervised image-to-image translation networks. In NIPS."},{"key":"1333_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Z., Yeh, R. A., Tang, X., Liu, Y., & Agarwala, A. (2017). Video frame synthesis using deep voxel flow. In ICCV.","DOI":"10.1109\/ICCV.2017.478"},{"key":"1333_CR31","unstructured":"Lotter, W., Kreiman, G., & Cox, D. (2017). Deep predictive coding networks for video prediction and unsupervised learning. In ICLR."},{"key":"1333_CR32","unstructured":"Mathieu, M., Couprie, C., & LeCun, Y. (2016). Deep multi-scale video prediction beyond mean square error. In ICLR."},{"key":"1333_CR33","unstructured":"Mescheder, L., Nowozin, S., & Geiger, A. (2018). Which training methods for GANs do actually converge? In ICML."},{"key":"1333_CR34","unstructured":"Miyato, T., Kataoka, T., Koyama, M., & Yoshida, Y. (2018). Spectral normalization for generative adversarial networks. In ICLR."},{"key":"1333_CR35","unstructured":"Miyato, T., & Koyama, M. (2018). cGANs with projection discriminator. In ICLR."},{"key":"1333_CR36","unstructured":"Oh, J., Guo, X., Lee, H., Lewis, R., & Singh, S. (2015). Action-conditional video prediction using deep networks in Atari games. In NIPS."},{"key":"1333_CR37","doi-asserted-by":"crossref","unstructured":"Ohnishi, K., Yamamoto, S., Ushiku, Y., & Harada, T. (2018). Hierarchical video generation from orthogonal information: Optical flow and texture. In AAAI.","DOI":"10.1609\/aaai.v32i1.11881"},{"key":"1333_CR38","volume-title":"Guide to NumPy","author":"TE Oliphant","year":"2015","unstructured":"Oliphant, T. E. (2015). Guide to NumPy (2nd ed.). USA: CreateSpace Independent Publishing Platform.","edition":"2"},{"key":"1333_CR39","unstructured":"Radford, A., Metz, L., & Chintala, S. (2016). Unsupervised representation learning with deep convolutional generative adversarial networks. In ICLR."},{"key":"1333_CR40","unstructured":"Ranzato, M., Szlam, A., Bruna, J., Mathieu, M., Collobert, R., & Chopra, S. (2014). Video (language) modeling: A baseline for generative models of natural videos. arXiv preprint arXiv:1412.6604."},{"key":"1333_CR41","unstructured":"R\u00f6ssler, A., Cozzolino, D., Verdoliva, L., Riess, C., Thies, J., & Nie\u00dfner, M. (2018). FaceForensics: A large-scale video dataset for forgery detection in human faces. arXiv preprint arXiv:1803.09179."},{"key":"1333_CR42","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., & Saito, S. (2017). Temporal generative adversarial nets with singular value clipping. In ICCV.","DOI":"10.1109\/ICCV.2017.308"},{"key":"1333_CR43","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., & Chen, X. (2016). Improved techniques for training GANs. In NIPS."},{"key":"1333_CR44","unstructured":"Shi, X., Chen, Z., Wang, H., Yeung, D. Y., Wong, W. K., & Woo, W. C. (2015). Convolutional LSTM network: A machine learning approach for precipitation nowcasting. In NIPS."},{"key":"1333_CR45","unstructured":"Soomro, K., Zamir, A. R., & Shah, M. (2012). UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402."},{"key":"1333_CR46","unstructured":"Srivastava, N., Mansimov, E., & Salakhutdinov, R. (2015). Unsupervised learning of video representations using LSTMs. In ICML."},{"key":"1333_CR47","unstructured":"Tokui, S., Oono, K., Hido, S., & Clayton, J. (2015). Chainer: A next-generation open source framework for deep learning. In Proceedings of workshop on machine learning systems in NIPS."},{"key":"1333_CR48","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., & Paluri, M. (2015). Learning spatiotemporal features with 3D convolutional networks. In ICCV.","DOI":"10.1109\/ICCV.2015.510"},{"key":"1333_CR49","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M. Y., Yang, X., & Kautz, J. (2018). MoCoGAN: Decomposing motion and content for video generation. In CVPR.","DOI":"10.1109\/CVPR.2018.00165"},{"key":"1333_CR50","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., & Gelly, S. (2018). Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717."},{"key":"1333_CR51","unstructured":"Vondrick, C., Pirsiavash, H., & Torralba, A. (2016). Generating videos with scene dynamics. In NIPS."},{"key":"1333_CR52","unstructured":"Wang, T. C., Liu, M. Y., Zhu, J. Y., Liu, G., Tao, A., Kautz, J., et al. (2018a). Video-to-video synthesis. arXiv preprint arXiv:1808.06601."},{"key":"1333_CR53","doi-asserted-by":"crossref","unstructured":"Wang, T. C., Liu, M. Y., Zhu, J. Y., Tao, A., Kautz, J., & Catanzaro, B. (2018b). High-resolution image synthesis and semantic manipulation with conditional GANs. In CVPR.","DOI":"10.1109\/CVPR.2018.00917"},{"key":"1333_CR54","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., & He, K. (2018c). Non-local neural networks. In CVPR.","DOI":"10.1109\/CVPR.2018.00813"},{"key":"1333_CR55","doi-asserted-by":"crossref","unstructured":"Yang, C., Wang, Z., Zhu, X., Huang, C., Shi, J., & Lin, D. (2018). Pose guided human video generation. In ECCV.","DOI":"10.1007\/978-3-030-01249-6_13"},{"key":"1333_CR56","unstructured":"Zhang, H., Goodfellow, I., Metaxas, D., & Odena, A. (2018). Self-attention generative adversarial networks. arXiv preprint arXiv:1805.08318."},{"key":"1333_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, T., Li, H., Zhang, S., Wang, X., Huang, X., et al. (2017a). StackGAN++: Realistic image synthesis with stacked generative adversarial networks. arXiv preprint arXiv:1710.10916.","DOI":"10.1109\/ICCV.2017.629"},{"key":"1333_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, T., Li, H., Zhang, S., Wang, X., Huang, X., et al. (2017b). StackGAN: Text to photo-realistic image synthesis with stacked generative adversarial networks. In ICCV.","DOI":"10.1109\/ICCV.2017.629"},{"key":"1333_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Xie, Y., & Yang, L. (2018). Photographic text-to-image synthesis with a hierarchically-nested adversarial network. In CVPR.","DOI":"10.1109\/CVPR.2018.00649"},{"key":"1333_CR60","doi-asserted-by":"crossref","unstructured":"Zhao, L., Peng, X., Tian, Y., Kapadia, M., & Metaxas, D. (2018). Learning to forecast and refine residual motion for image-to-video generation. In ECCV.","DOI":"10.1007\/978-3-030-01267-0_24"},{"key":"1333_CR61","doi-asserted-by":"crossref","unstructured":"Zhu, J. Y., Park, T., Isola, P., & Efros, A. A. (2017). Unpaired image-to-image translation using cycle-consistent adversarial networks. In ICCV.","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-020-01333-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-020-01333-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-020-01333-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,25]],"date-time":"2022-10-25T15:32:33Z","timestamp":1666711953000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-020-01333-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5,29]]},"references-count":61,"journal-issue":{"issue":"10-11","published-print":{"date-parts":[[2020,11]]}},"alternative-id":["1333"],"URL":"https:\/\/doi.org\/10.1007\/s11263-020-01333-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,5,29]]},"assertion":[{"value":"15 May 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}