{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T13:30:53Z","timestamp":1742995853783,"version":"3.40.3"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781711"},{"type":"electronic","value":"9783031781728"}],"license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78172-8_10","type":"book-chapter","created":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T09:47:19Z","timestamp":1733132839000},"page":"146-161","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Make an Image Move: Few-Shot Based Video Generation Guided by\u00a0CLIP"],"prefix":"10.1007","author":[{"given":"Yonglong","family":"Huang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nannan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fuqin","family":"Deng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruiquan","family":"Ge","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changmiao","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","key":"10_CR1","DOI":"10.1109\/ICCV51070.2023.00355"},{"unstructured":"Ye, H., Zhang, J., Liu, S., Han, X., Yang, W.: Text compatible image prompt adapter for text-to-image diffusion models. IP-adapter (2023)","key":"10_CR2"},{"doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","key":"10_CR3","DOI":"10.1109\/CVPR52688.2022.01042"},{"doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","key":"10_CR4","DOI":"10.1109\/CVPR52729.2023.02161"},{"unstructured":"Guo, Y., et al.: AnimateDiff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)","key":"10_CR5"},{"doi-asserted-by":"crossref","unstructured":"Zhang, D.J., et al.: Show-1: marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818 2023","key":"10_CR6","DOI":"10.1007\/s11263-024-02271-9"},{"unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J.: CogVideo: large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868 (2022)","key":"10_CR7"},{"doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011 (2023)","key":"10_CR8","DOI":"10.1109\/ICCV51070.2023.00675"},{"doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2Video-Zero: text-to-image diffusion models are Zero-Shot video generators. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15954\u201315964 (2023)","key":"10_CR9","DOI":"10.1109\/ICCV51070.2023.01462"},{"unstructured":"Huang, H., Feng, Y., Shi, C., Xu, L., Yu, J., Yang, S.: Free-bloom: zero-shot text-to-video generator with LLM director and LDM animator. In: Conference on Neural Information Processing Systems (2023)","key":"10_CR10"},{"doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7623\u20137633 (2023)","key":"10_CR11","DOI":"10.1109\/ICCV51070.2023.00701"},{"doi-asserted-by":"crossref","unstructured":"Wu, R., Chen, L., Yang, T., Guo, C., Li, C., Zhang, X.: LAMP: learn a motion pattern by few-shot tuning a text-to-image diffusion model. arXiv preprint arXiv:2310.10769 (2023)","key":"10_CR12","DOI":"10.1109\/CVPR52733.2024.00677"},{"unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)","key":"10_CR13"},{"unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)","key":"10_CR14"},{"doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","key":"10_CR15","DOI":"10.1109\/CVPR46437.2021.01268"},{"doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: StackGAN: text to photo-realistic image synthesis with stacked generative adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5907\u20135915 (2017)","key":"10_CR16","DOI":"10.1109\/ICCV.2017.629"},{"unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational Bayes. arXiv preprint arXiv:1312.6114 (2013)","key":"10_CR17"},{"unstructured":"Van Den Oord, A., Vinyals, O., et al.: Neural discrete representation learning. In: Advances in Neural Information Processing Systems, vol. 30 (2017)","key":"10_CR18"},{"unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)","key":"10_CR19"},{"unstructured":"Nichol, A.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)","key":"10_CR20"},{"unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)","key":"10_CR21"},{"unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 36479\u201336494 (2022)","key":"10_CR22"},{"doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: CVPR, pp. 10696\u201310706 (2022)","key":"10_CR23","DOI":"10.1109\/CVPR52688.2022.01043"},{"unstructured":"Ho, J., et al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)","key":"10_CR24"},{"unstructured":"Singer, U., et al.: Make-a-video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)","key":"10_CR25"},{"unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: MagicVideo: efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)","key":"10_CR26"},{"doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","key":"10_CR27","DOI":"10.1109\/ICCV48922.2021.00175"},{"unstructured":"Hong, S., Seo, J., Hong, S., Shin, H., Kim, S.: Large language models are frame-level directors for zero-shot text-to-video generation. arXiv preprint arXiv:2305.14330 (2023)","key":"10_CR28"},{"doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., Jia, J.: Video-P2P: video editing with cross-attention control. arXiv preprint arXiv:2303.04761 (2023)","key":"10_CR29","DOI":"10.1109\/CVPR52733.2024.00821"},{"doi-asserted-by":"crossref","unstructured":"Qi, C.: FateZero: fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535 (2023)","key":"10_CR30","DOI":"10.1109\/ICCV51070.2023.01460"},{"doi-asserted-by":"crossref","unstructured":"Mikko, K., et al.: Multilayer networks. J. Complex Netw. 2(3), 203\u2013271 (2014)","key":"10_CR31","DOI":"10.1093\/comnet\/cnu016"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78172-8_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T10:05:36Z","timestamp":1733133936000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78172-8_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"ISBN":["9783031781711","9783031781728"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78172-8_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"3 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}