{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:43:57Z","timestamp":1777657437577,"version":"3.51.4"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729423","type":"print"},{"value":"9783031729430","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72943-0_9","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:40:50Z","timestamp":1732801250000},"page":"148-165","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Hybrid Video Diffusion Models with\u00a02D Triplane and\u00a03D Wavelet Representation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9506-9966","authenticated-orcid":false,"given":"Kihong","family":"Kim","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6748-7066","authenticated-orcid":false,"given":"Haneol","family":"Lee","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5250-7512","authenticated-orcid":false,"given":"Jihye","family":"Park","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6426-5433","authenticated-orcid":false,"given":"Seyeon","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5606-0761","authenticated-orcid":false,"given":"Kwanghee","family":"Lee","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2927-6273","authenticated-orcid":false,"given":"Seungryong","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5252-9668","authenticated-orcid":false,"given":"Jaejun","family":"Yoo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"9_CR1","unstructured":"Ajay, A., et al.: Compositional foundation models for hierarchical planning. arXiv preprint arXiv:2309.08587 (2023)"},{"key":"9_CR2","unstructured":"An, J., et al.: Latent-shift: latent diffusion with temporal shift for efficient text-to-video generation. arXiv preprint arXiv:2304.08477 (2023)"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Anciukevi\u010dius, T., et al.: RenderDiffusion: image diffusion for 3D reconstruction, inpainting and generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12608\u201312618 (2023)","DOI":"10.1109\/CVPR52729.2023.01213"},{"key":"9_CR4","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML (2021)"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Ceylan, D., Huang, C.H.P., Mitra, N.J.: Pix2Video: video editing using image diffusion. arXiv preprint arXiv:2303.12688 (2023)","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Chan, E.R., et\u00a0al.: Efficient geometry-aware 3D generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16123\u201316133 (2022)","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Chan, E.R., Monteiro, M., Kellnhofer, P., Wu, J., Wetzstein, G.: Pi-GAN: periodic implicit generative adversarial networks for 3D-aware image synthesis. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00574"},{"key":"9_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1007\/978-3-319-46723-8_49","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2016","author":"\u00d6 \u00c7i\u00e7ek","year":"2016","unstructured":"\u00c7i\u00e7ek, \u00d6., Abdulkadir, A., Lienkamp, S.S., Brox, T., Ronneberger, O.: 3D U-Net: learning dense volumetric segmentation from sparse annotation. In: Ourselin, S., Joskowicz, L., Sabuncu, M.R., Unal, G., Wells, W. (eds.) MICCAI 2016. LNCS, vol. 9901, pp. 424\u2013432. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46723-8_49"},{"key":"9_CR10","unstructured":"Dai, Y., et al.: Learning universal policies via text-guided video generation. arXiv preprint arXiv:2302.00111 (2023)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011 (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"9_CR12","unstructured":"Gu, J., et al.: NerfDiff: single-image view synthesis with nerf-guided distillation from 3D-aware diffusion. In: International Conference on Machine Learning, pp. 11808\u201311826. PMLR (2023)"},{"key":"9_CR13","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q.: Latent video diffusion models for high-fidelity video generation with arbitrary lengths. arXiv preprint arXiv:2211.13221 (2022)"},{"key":"9_CR14","unstructured":"Ho, J., et al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"9_CR15","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems (2020)"},{"key":"9_CR16","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models (2022). https:\/\/arxiv.org\/abs\/2204.03458"},{"key":"9_CR17","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv preprint arXiv:2204.03458 (2022)"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Hore, A., Ziou, D.: Image quality metrics: PSNR vs. SSIM. In: 2010 20th International Conference on Pattern Recognition, pp. 2366\u20132369. IEEE (2010)","DOI":"10.1109\/ICPR.2010.579"},{"key":"9_CR19","unstructured":"Hu, Y., Chen, Z., Luo, C.: LaMD: latent motion diffusion for video generation. arXiv preprint arXiv:2304.11603 (2023)"},{"issue":"13","key":"9_CR20","doi-asserted-by":"publisher","first-page":"800","DOI":"10.1049\/el:20080522","volume":"44","author":"Q Huynh-Thu","year":"2008","unstructured":"Huynh-Thu, Q., Ghanbari, M.: Scope of validity of PSNR in image\/video quality assessment. Electron. Lett. 44(13), 800\u2013801 (2008)","journal-title":"Electron. Lett."},{"key":"9_CR21","unstructured":"Kalchbrenner, N., et al.: Video pixel networks. In: ICML (2017)"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2Video-Zero: text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439 (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., Min, M., Shen, D., Carlson, D., Carin, L.: Video generation from text. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., Jia, J.: Video-P2P: video editing with cross-attention control. arXiv preprint arXiv:2303.04761 (2023)","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"9_CR26","unstructured":"Ma, X., et al.: Latte: latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Mei, K., Patel, V.M.: VIDM: video implicit diffusion models. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i8.26094"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Pan, Y., Qiu, Z., Yao, T., Li, H., Mei, T.: To create what you tell: generating videos from captions. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 1789\u20131798 (2017)","DOI":"10.1145\/3123266.3127905"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Qi, C., et al.: FateZero: fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Ruan, L., et al.: MM-Diffusion: learning multi-modal diffusion models for joint audio and video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10219\u201310228 (2023)","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"9_CR32","unstructured":"Schwarz, K., Liao, Y., Niemeyer, M., Geiger, A.: GRAF: generative radiance fields for 3D-aware image synthesis. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Secker, A., Taubman, D.: Highly scalable video compression using a lifting-based 3D wavelet transform with deformable mesh motion compensation. In: Proceedings of the International Conference on Image Processing, vol.\u00a03, pp. 749\u2013752. IEEE (2002)","DOI":"10.1109\/ICIP.2002.1039080"},{"key":"9_CR34","unstructured":"Siarohin, A., Lathuili\u00e8re, S., Tulyakov, S., Ricci, E., Sebe, N.: First order motion model for image animation. In: NeurIPS (2019)"},{"key":"9_CR35","unstructured":"Singer, U., et\u00a0al.: Make-a-video: text-to-video generation without text-video data. In: ICLR (2023)"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Tulyakov, S., Elhoseiny, M.: StyleGAN-V: a continuous video generator with the price, image quality and perks of StyleGAN2. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3626\u20133636 (2022)","DOI":"10.1109\/CVPR52688.2022.00361"},{"key":"9_CR37","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"9_CR38","unstructured":"Tian, Y., et al.: A good image generator is what you need for high-resolution video synthesis. In: ICLR (2021)"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J.: MoCoGAN: decomposing motion and content for video generation. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"9_CR40","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: a new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)"},{"key":"9_CR41","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: FVD: a new metric for video generation. In: DGS@ICLR (2019). https:\/\/api.semanticscholar.org\/CorpusID:198489709"},{"key":"9_CR42","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"9_CR43","unstructured":"Voleti, V., Jolicoeur-Martineau, A., Pal, C.: MCVD-masked conditional video diffusion for prediction, generation, and interpolation. In: Advances in Neural Information Processing Systems 35, pp. 23371\u201323385 (2022)"},{"key":"9_CR44","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: NeurIPS (2016)"},{"key":"9_CR45","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/1687-5281-2007-042761","volume":"2007","author":"B Wang","year":"2007","unstructured":"Wang, B., Wang, Y., Selesnick, I., Vetro, A.: Video coding using 3D dual-tree wavelet transform. EURASIP J. Image Video Process. 2007, 1\u201315 (2007)","journal-title":"EURASIP J. Image Video Process."},{"issue":"4","key":"9_CR46","doi-asserted-by":"publisher","first-page":"547","DOI":"10.1109\/42.511757","volume":"15","author":"J Wang","year":"1996","unstructured":"Wang, J., Huang, K.: Medical image compression by using three-dimensional wavelet transformation. IEEE Trans. Med. Imaging 15(4), 547\u2013554 (1996)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"9_CR47","doi-asserted-by":"crossref","unstructured":"Wang, T., et\u00a0al.: RODIN: a generative model for sculpting 3D digital avatars using diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4563\u20134573 (2023)","DOI":"10.1109\/CVPR52729.2023.00443"},{"key":"9_CR48","unstructured":"Weissenborn, D., T\u00e4ckstr\u00f6m, O., Uszkoreit, J.: Scaling autoregressive video models. arXiv preprint arXiv:1906.02634 (2019)"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. arXiv preprint arXiv:2212.11565 (2022)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"9_CR50","doi-asserted-by":"crossref","unstructured":"Xiong, W., Luo, W., Ma, L., Liu, W., Luo, J.: Learning to generate time-lapse videos using multi-stage dynamic generative adversarial networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00251"},{"key":"9_CR51","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers. arXiv preprint arXiv:2104.10157 (2021)"},{"key":"9_CR52","doi-asserted-by":"crossref","unstructured":"Yu, S., Sohn, K., Kim, S., Shin, J.: Video probabilistic diffusion models in projected latent space. arXiv preprint arXiv:2302.07685 (2023)","DOI":"10.1109\/CVPR52729.2023.01770"},{"key":"9_CR53","unstructured":"Yu, S., et al.: Generating videos with dynamics-aware implicit generative adversarial networks. In: The Tenth International Conference on Learning Representations (2022)"},{"key":"9_CR54","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"9_CR55","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: MagicVideo: efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72943-0_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:16:45Z","timestamp":1732803405000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72943-0_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031729423","9783031729430"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72943-0_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}