{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T16:25:23Z","timestamp":1782318323247,"version":"3.54.5"},"publisher-location":"Cham","reference-count":54,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726699","type":"print"},{"value":"9783031726705","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_19","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"331-348","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":49,"title":["DragAnything: Motion Control for\u00a0Anything Using Entity Representation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3912-7212","authenticated-orcid":false,"given":"Weijia","family":"Wu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhuang","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuchao","family":"Gu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yefei","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"David Junhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yan","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tingting","family":"Gao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Di","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"19_CR1","unstructured":"https:\/\/www.pika.art\/"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Ardino, P., De\u00a0Nadai, M., Lepri, B., Ricci, E., Lathuili\u00e8re, S.: Click to move: controlling video generation with sparse motion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14749\u201314758 (2021)","DOI":"10.1109\/ICCV48922.2021.01448"},{"key":"19_CR3","unstructured":"Blattmann, A., et\u00a0al.: Stable video diffusion: scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Milbich, T., Dorkenwald, M., Ommer, B.: iPOKE: poking a still image for controlled stochastic video synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14707\u201314717 (2021)","DOI":"10.1109\/ICCV48922.2021.01444"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Milbich, T., Dorkenwald, M., Ommer, B.: Understanding object dynamics for interactive image-to-video synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5171\u20135181 (2021)","DOI":"10.1109\/CVPR46437.2021.00513"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Cao, Z., Simon, T., Wei, S.E., Sheikh, Y.: Realtime multi-person 2D pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7291\u20137299 (2017)","DOI":"10.1109\/CVPR.2017.143"},{"key":"19_CR8","unstructured":"Chen, H., et\u00a0al.: VideoCrafter1: open diffusion models for high-quality video generation. arXiv preprint arXiv:2310.19512 (2023)"},{"key":"19_CR9","unstructured":"Chen, T.S., Lin, C.H., Tseng, H.Y., Lin, T.Y., Yang, M.H.: Motion-conditioned diffusion model for controllable video synthesis. arXiv preprint arXiv:2304.14404 (2023)"},{"key":"19_CR10","unstructured":"Chen, W., et al.: Control-a-video: controllable text-to-video generation with diffusion models. arXiv preprint arXiv:2305.13840 (2023)"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Huang, L., Liu, Y., Shen, Y., Zhao, D., Zhao, H.: AnyDoor: zero-shot object-level image customization. arXiv preprint arXiv:2307.09481 (2023)","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"19_CR12","unstructured":"Dai, X., et\u00a0al.: EMU: enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807 (2023)"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7346\u20137356 (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: EMU video: factorizing text-to-video generation by explicit image conditioning. arXiv preprint arXiv:2311.10709 (2023)","DOI":"10.1007\/978-3-031-73033-7_12"},{"key":"19_CR15","unstructured":"Gu, Y., et\u00a0al.: Mix-of-show: Decentralized low-rank adaptation for multi-concept customization of diffusion models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Gu, Y., et al.: VideoSwap: customized video subject swapping with interactive semantic point correspondence. arXiv preprint arXiv:2312.02087 (2023)","DOI":"10.1109\/CVPR52733.2024.00728"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Guo, Y., Yang, C., Rao, A., Agrawala, M., Lin, D., Dai, B.: SparseCtrl: adding sparse controls to text-to-video diffusion models. arXiv preprint arXiv:2311.16933 (2023)","DOI":"10.1007\/978-3-031-72946-1_19"},{"key":"19_CR18","unstructured":"Guo, Y., et al.: AnimatEdiff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Hao, Z., Huang, X., Belongie, S.: Controllable video generation with sparse trajectories. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7854\u20137863 (2018)","DOI":"10.1109\/CVPR.2018.00819"},{"key":"19_CR20","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"19_CR21","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR22","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv:2204.03458 (2022)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Karaev, N., Rocco, I., Graham, B., Neverova, N., Vedaldi, A., Rupprecht, C.: CoTracker: it is better to track together. arXiv:2307.07635 (2023)","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"19_CR24","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"19_CR25","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"19_CR26","unstructured":"Ma, W.D.K., Lewis, J., Kleijn, W.B.: TrailBlazer: trajectory control for diffusion-based video generation. arXiv preprint arXiv:2401.00896 (2023)"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Ma, Y., et al.: Follow your pose: Pose-guided text-to-video generation using pose-free videos. arXiv preprint arXiv:2304.01186 (2023)","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Miao, J., et al.: Large-scale video panoptic segmentation in the wild: a benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21033\u201321043 (2022)","DOI":"10.1109\/CVPR52688.2022.02036"},{"key":"19_CR29","unstructured":"Mou, C., Wang, X., Song, J., Shan, Y., Zhang, J.: DragonDiffusion: eDnabling drag-style manipulation on diffusion models. arXiv preprint arXiv:2307.02421 (2023)"},{"key":"19_CR30","unstructured":"Oquab, M., et\u00a0al.: DINOV2: earning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Pan, X., Tewari, A., Leimk\u00fchler, T., Liu, L., Meka, A., Theobalt, C.: Drag your GAN: interactive point-based manipulation on the generative image manifold. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591500"},{"key":"19_CR32","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"19_CR33","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: MICCAI (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"19_CR35","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR36","unstructured":"Seitzer, M.: PyTorch-fid: FID Score for PyTorch (2020). https:\/\/github.com\/mseitzer\/pytorch-fid"},{"key":"19_CR37","unstructured":"Tang, L., Jia, M., Wang, Q., Phoo, C.P., Hariharan, B.: Emergent correspondence from image diffusion. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"19_CR38","unstructured":"Tim, B., et al.: Video generation models as world simulators (2024)"},{"key":"19_CR39","unstructured":"Unterthiner, T., Van\u00a0Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: a new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)"},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, X., Cao, Y., Wang, W., Shen, C., Huang, T.: SegGPT: segmenting everything in context. arXiv preprint arXiv:2304.03284 (2023)","DOI":"10.1109\/ICCV51070.2023.00110"},{"key":"19_CR41","unstructured":"Wang, Y., et\u00a0al.: LAVIE: high-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103 (2023)"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: MotionCtrl: a unified and flexible motion controller for video generation. arXiv preprint arXiv:2312.03641 (2023)","DOI":"10.1145\/3641519.3657518"},{"key":"19_CR43","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7623\u20137633 (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"19_CR44","unstructured":"Wu, W., et al.: Paragraph-to-image generation with information-enriched diffusion model. arXiv preprint arXiv:2311.14284 (2023)"},{"key":"19_CR45","unstructured":"Wu, W., et al.: DatasetDm: synthesizing data with perception annotations using diffusion models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"19_CR46","unstructured":"Xing, Z., et al.: a survey on video diffusion models. arXiv preprint arXiv:2310.10647 (2023)"},{"key":"19_CR47","unstructured":"Xue, Z., et al.: RAPHAEL: text-to-image generation via large mixture of diffusion paths. arXiv preprint arXiv:2305.18295 (2023)"},{"key":"19_CR48","unstructured":"Yin, S., et al.: DRAGNUWA: fine-grained control in video generation by integrating text, image, and trajectory. arXiv preprint arXiv:2308.08089 (2023)"},{"key":"19_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, D.J., et al.: Show-1: marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818 (2023)","DOI":"10.1007\/s11263-024-02271-9"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"19_CR51","unstructured":"Zhang, S., et al.: I2VGEN-XL: high-quality image-to-video synthesis via cascaded diffusion models. arXiv preprint arXiv:2311.04145 (2023)"},{"key":"19_CR52","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: ControlVideo: training-free controllable text-to-video generation. arXiv preprint arXiv:2305.13077 (2023)"},{"key":"19_CR53","unstructured":"Zhao, R., et al.: MotionDirector: motion customization of text-to-video diffusion models. arXiv preprint arXiv:2310.08465 (2023)"},{"key":"19_CR54","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: MagicVideo: efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:20:54Z","timestamp":1732828854000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}