{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:28:18Z","timestamp":1780392498297,"version":"3.54.1"},"publisher-location":"Cham","reference-count":84,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729911","type":"print"},{"value":"9783031729928","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72992-8_16","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"273-290","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":51,"title":["MotionDirector: Motion Customization of\u00a0Text-to-Video Diffusion Models"],"prefix":"10.1007","author":[{"given":"Rui","family":"Zhao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuchao","family":"Gu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jay Zhangjie","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"David Junhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jia-Wei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weijia","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jussi","family":"Keppo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Balaji, Y., Min, M.R., Bai, B., Chellappa, R., Graf, H.P.: Conditional GAN with discriminative filter generation for text-to-video synthesis. In: IJCAI, vol.\u00a01, p.\u00a02 (2019)","DOI":"10.24963\/ijcai.2019\/276"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"16_CR4","unstructured":"Chen, T.S., Lin, C.H., Tseng, H.Y., Lin, T.Y., Yang, M.H.: Motion-conditioned diffusion model for controllable video synthesis (2023). arXiv:2304.14404"},{"key":"16_CR5","unstructured":"Chen, W., et al.: Control-A-Video: controllable text-to-video generation with diffusion models. arXiv preprint arXiv:2305.13840 (2023)"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Huang, L., Liu, Y., Shen, Y., Zhao, D., Zhao, H.: AnyDoor: zero-shot object-level image customization. arXiv preprint arXiv:2307.09481 (2023)","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"16_CR8","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Advances in Neural Information Processing Systems, vol. 34, pp. 8780\u20138794 (2021)"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Duan, Z., et al.: DiffSynth: latent in-iteration deflickering for realistic video synthesis (2023). arXiv:2308.03463","DOI":"10.1007\/978-3-031-70381-2_21"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011 (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Ge, S., et al.: Long video generation with time-agnostic VQGAN and time-sensitive transformer. arXiv preprint arXiv:2204.03638 (2022)","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Ge, S., et al.: Preserve your own correlation: a noise prior for video diffusion models (2023). arXiv:2305.10474","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"16_CR13","unstructured":"Gu, Y., et\u00a0al.: Mix-of-Show: decentralized low-rank adaptation for multi-concept customization of diffusion models. arXiv preprint arXiv:2305.18292 (2023)"},{"key":"16_CR14","unstructured":"Guo, Y., et al.: AnimateDiff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)"},{"key":"16_CR15","unstructured":"He, Y., et\u00a0al.: Animate-A-Story: storytelling with retrieval-augmented video generation (2023). arXiv:2307.06940"},{"key":"16_CR16","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q.: Latent video diffusion models for high-fidelity long video generation (2022)"},{"key":"16_CR17","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q.: Latent video diffusion models for high-fidelity video generation with arbitrary lengths (2022). arXiv:2211.13221"},{"key":"16_CR18","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q.: VideoCrafter: a toolkit for text-to-video generation and editing (2023). https:\/\/github.com\/AILab-CVC\/VideoCrafter"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"16_CR20","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"16_CR21","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"16_CR22","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. In: NeurIPS (2022)"},{"key":"16_CR23","unstructured":"Hong, S., Seo, J., Hong, S., Shin, H., Kim, S.: Large language models are frame-level directors for zero-shot text-to-video generation (2023). arXiv:2305.14330"},{"key":"16_CR24","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J.: CogVideo: large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868 (2022)"},{"key":"16_CR25","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"16_CR26","unstructured":"Huang, H., Feng, Y., Shi, C., Xu, L., Yu, J., Yang, S.: Free-Bloom: zero-shot text-to-video generator with LLM director and LDM animator. In: NeurIPS (2023)"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Jeong, H., Park, G.Y., Ye, J.C.: VMC: video motion customization using temporal attention adaption for text-to-video diffusion models. arXiv preprint arXiv:2312.00845 (2023)","DOI":"10.1109\/CVPR52733.2024.00880"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Karras, J., Holynski, A., Wang, T.C., Kemelmacher-Shlizerman, I.: DreamPose: fashion image-to-video synthesis via stable diffusion (2023). arXiv:2304.06025","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2Video-Zero: text-to-image diffusion models are zero-shot video generators. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"16_CR30","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"16_CR31","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-Pic: an open dataset of user preferences for text-to-image generation. arXiv preprint arXiv:2305.01569 (2023)"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1931\u20131941 (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"16_CR33","unstructured":"Le Moing, G., Ponce, J., Schmid, C.: CCVS: context-aware controllable video synthesis. In: Advances in Neural Information Processing Systems, vol. 34, pp. 14042\u201314055 (2021)"},{"key":"16_CR34","unstructured":"Li, X., et al.: VideoGen: a reference-guided latent diffusion approach for high definition text-to-video generation. arXiv preprint arXiv:2309.00398 (2023)"},{"key":"16_CR35","unstructured":"Lian, L., Shi, B., Yala, A., Darrell, T., Li, B.: LLM-grounded video diffusion models (2023). arXiv:2309.17444"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"Luo, Z., et al.: VideoFusion: decomposed diffusion models for high-quality video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Ma, Y., et al.: Follow Your Pose: pose-guided text-to-video generation using pose-free videos. arXiv preprint arXiv:2304.01186 (2023)","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"16_CR38","unstructured":"Materzynska, J., Sivic, J., Shechtman, E., Torralba, A., Zhang, R., Russell, B.: Customizing motion in text-to-video diffusion models. arXiv preprint arXiv:2312.04966 (2023)"},{"key":"16_CR39","doi-asserted-by":"crossref","unstructured":"Mei, K., Patel, V.: VIDM: video implicit diffusion models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 9117\u20139125 (2023)","DOI":"10.1609\/aaai.v37i8.26094"},{"key":"16_CR40","doi-asserted-by":"crossref","unstructured":"Ni, H., Shi, C., Li, K., Huang, S.X., Min, M.R.: Conditional image-to-video generation with latent flow diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18444\u201318455 (2023)","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"16_CR41","unstructured":"Qin, B., Ye, W., Yu, Q., Tang, S., Zhuang, Y.: Dancing Avatar: pose and text-guided human motion videos synthesis with image diffusion model (2023). arXiv:2308.07749"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Ren, Y., et al.: Customize-A-Video: one-shot motion customization of text-to-video diffusion models. arXiv preprint arXiv:2402.14780 (2024)","DOI":"10.1007\/978-3-031-73024-5_20"},{"key":"16_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"16_CR45","unstructured":"Ryu, S.: Low-rank adaptation for fast text-to-image diffusion fine-tuning (2023). https:\/\/github.com\/cloneofsimo\/lora"},{"key":"16_CR46","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., Saito, S.: Temporal generative adversarial nets with singular value clipping. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2830\u20132839 (2017)","DOI":"10.1109\/ICCV.2017.308"},{"key":"16_CR47","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: Advances in Neural Information Processing Systems, vol. 35, pp. 25278\u201325294 (2022)"},{"key":"16_CR48","doi-asserted-by":"crossref","unstructured":"Shen, X., Li, X., Elhoseiny, M.: MoStGAN-V: video generation with temporal motion styles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5652\u20135661 (2023)","DOI":"10.1109\/CVPR52729.2023.00547"},{"key":"16_CR49","unstructured":"Singer, U., et\u00a0al.: Make-A-Video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"16_CR50","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Tulyakov, S., Elhoseiny, M.: StyleGAN-V: a continuous video generator with the price, image quality and perks of StyleGAN2. arXiv preprint arXiv:2112.14683 (2021)","DOI":"10.1109\/CVPR52688.2022.00361"},{"key":"16_CR51","unstructured":"Smith, J.S., et al.: Continual Diffusion: continual customization of text-to-image diffusion with C-LoRA. arXiv preprint arXiv:2304.06027 (2023)"},{"key":"16_CR52","series-title":"Advances in Computer Vision and Pattern Recognition","doi-asserted-by":"publisher","first-page":"181","DOI":"10.1007\/978-3-319-09396-3_9","volume-title":"Computer Vision in Sports","author":"K Soomro","year":"2014","unstructured":"Soomro, K., Zamir, A.R.: Action recognition in realistic sports videos. In: Moeslund, T.B., Thomas, G., Hilton, A. (eds.) Computer Vision in Sports. ACVPR, pp. 181\u2013208. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-09396-3_9"},{"key":"16_CR53","unstructured":"Srivastava, N., Mansimov, E., Salakhudinov, R.: Unsupervised learning of video representations using LSTMs. In: International Conference on Machine Learning, pp. 843\u2013852. PMLR (2015)"},{"key":"16_CR54","unstructured":"Sterling, S.: Zeroscope (2023). https:\/\/huggingface.co\/cerspense\/zeroscope_v2_576w"},{"key":"16_CR55","unstructured":"Tian, Y., et al.: A good image generator is what you need for high-resolution video synthesis. In: International Conference on Learning Representations (2020)"},{"key":"16_CR56","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J.: MoCoGAN: decomposing motion and content for video generation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1526\u20131535 (2018)","DOI":"10.1109\/CVPR.2018.00165"},{"key":"16_CR57","unstructured":"Voleti, V., Jolicoeur-Martineau, A., Pal, C.: MCVD-masked conditional video diffusion for prediction, generation, and interpolation. In: Advances in Neural Information Processing Systems, vol. 35, pp. 23371\u201323385 (2022)"},{"key":"16_CR58","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"16_CR59","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S.: ModelScope text-to-video technical report. arXiv preprint arXiv:2308.06571 (2023)"},{"key":"16_CR60","unstructured":"Wang, T., et al.: DisCo: disentangled control for referring human dance generation in real world (2023). arXiv:2307.00040"},{"key":"16_CR61","unstructured":"Wang, W., et al.: VideoFactory: swap attention in spatiotemporal diffusions for text-to-video generation (2023). arXiv:2305.10874"},{"key":"16_CR62","unstructured":"Wang, X., et al.: VideoComposer: compositional video synthesis with motion controllability. arXiv preprint arXiv:2306.02018 (2023)"},{"key":"16_CR63","unstructured":"Wang, Y., et al.: LAVIE: high-quality video generation with cascaded latent diffusion models (2023). arXiv:2309.15103. https:\/\/api.semanticscholar.org\/CorpusID:262823915"},{"key":"16_CR64","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: MotionCtrl: a unified and flexible motion controller for video generation. arXiv preprint arXiv:2312.03641 (2023)","DOI":"10.1145\/3641519.3657518"},{"key":"16_CR65","doi-asserted-by":"crossref","unstructured":"Wei, Y., et al.: DreamVideo: composing your dream videos with customized subject and motion. arXiv preprint arXiv:2312.04433 (2023)","DOI":"10.1109\/CVPR52733.2024.00625"},{"key":"16_CR66","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: ELITE: encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848 (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"16_CR67","unstructured":"Wu, J.Z., et al.: The text-guided video editing benchmark at LOVEU 2023 (2023). https:\/\/sites.google.com\/view\/loveucvpr23\/track4"},{"key":"16_CR68","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-A-Video: one-shot tuning of image diffusion models for text-to-video generation. arXiv preprint arXiv:2212.11565 (2022)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"16_CR69","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-A-Video: one-shot tuning of image diffusion models for text-to-video generation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"16_CR70","unstructured":"Wu, J.Z., et\u00a0al.: CVPR 2023 text guided video editing competition. arXiv preprint arXiv:2310.16003 (2023)"},{"key":"16_CR71","doi-asserted-by":"crossref","unstructured":"Wu, R., Chen, L., Yang, T., Guo, C., Li, C., Zhang, X.: LAMP: learn a motion pattern for few-shot-based video generation. arXiv preprint arXiv:2310.10769 (2023)","DOI":"10.1109\/CVPR52733.2024.00677"},{"key":"16_CR72","doi-asserted-by":"crossref","unstructured":"Xing, J., et\u00a0al.: Make-Your-Video: customized video generation using textual and structural guidance (2023). arXiv:2306.00943","DOI":"10.1109\/TVCG.2024.3365804"},{"key":"16_CR73","unstructured":"Xing, Z., et al.: A survey on video diffusion models. arXiv preprint arXiv:2310.10647 (2023)"},{"key":"16_CR74","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers. arXiv preprint arXiv:2104.10157 (2021)"},{"key":"16_CR75","doi-asserted-by":"crossref","unstructured":"Yang, S., et al.: Direct-a-Video: customized video generation with user-directed camera movement and object motion. arXiv preprint arXiv:2402.03162 (2024)","DOI":"10.1145\/3641519.3657481"},{"key":"16_CR76","unstructured":"Yin, S., et al.: DragNUWA: fine-grained control in video generation by integrating text, image, and trajectory (2023). arXiv:2308.08089"},{"key":"16_CR77","doi-asserted-by":"crossref","unstructured":"Yin, S., et\u00a0al.: NUWA-XL: diffusion over diffusion for extremely long video generation (2023). arXiv:2303.12346","DOI":"10.18653\/v1\/2023.acl-long.73"},{"key":"16_CR78","doi-asserted-by":"crossref","unstructured":"Yu, S., Sohn, K., Kim, S., Shin, J.: Video probabilistic diffusion models in projected latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18456\u201318466 (2023)","DOI":"10.1109\/CVPR52729.2023.01770"},{"key":"16_CR79","unstructured":"Yu, S., et al.: Generating videos with dynamics-aware implicit generative adversarial networks. In: International Conference on Learning Representations (2021)"},{"key":"16_CR80","doi-asserted-by":"crossref","unstructured":"Zhang, D.J., et al.: Show-1: marrying pixel and latent diffusion models for text-to-video generation (2023). arXiv:2309.15818","DOI":"10.1007\/s11263-024-02271-9"},{"key":"16_CR81","doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models (2023). arXiv preprint arXiv:2302.05543","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"16_CR82","unstructured":"Zhao, R., et al.: MotionDirector: motion customization of text-to-video diffusion models. arXiv preprint arXiv:2310.08465 (2023)"},{"key":"16_CR83","unstructured":"Zhao, S., et al.: Uni-ControlNet: all-in-one control to text-to-image diffusion models. arXiv preprint arXiv:2305.16322 (2023)"},{"key":"16_CR84","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: MagicVideo: efficient video generation with latent diffusion models (2022). arXiv:2211.11018"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72992-8_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T12:05:40Z","timestamp":1732968340000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72992-8_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031729911","9783031729928"],"references-count":84,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72992-8_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}