{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,23]],"date-time":"2025-10-23T01:09:23Z","timestamp":1761181763735,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China under Grant","award":["62202063"],"award-info":[{"award-number":["62202063"]}]},{"name":"Beijing Natural Science Foundation","award":["L243027"],"award-info":[{"award-number":["L243027"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746270.3760235","type":"proceedings-article","created":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T15:14:09Z","timestamp":1760973249000},"page":"128-136","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SketchDancing: A Text-Driven Framework for Vector Sketch Animation Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3905-2062","authenticated-orcid":false,"given":"Xianlin","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Digital Media and Design Arts, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2824-3023","authenticated-orcid":false,"given":"Zhuoyun","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Digital Media and Design Arts, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0688-9633","authenticated-orcid":false,"given":"Jin","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science (National Demonstrative School of Software), Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1058-2799","authenticated-orcid":false,"given":"Xueming","family":"Li","sequence":"additional","affiliation":[{"name":"School of Digital Media and Design Arts, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6955-6635","authenticated-orcid":false,"given":"Mengshi","family":"Qi","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,26]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Hmrishav Bandyopadhyay and Yi-Zhe Song. 2024. FlipSketch: Flipping Static Drawings to Text-Guided Sketch Animations. arXiv:2411.10818 [cs.GR] https:\/\/arxiv.org\/abs\/2411.10818"},{"key":"e_1_3_2_1_2_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang Chao Weng and Ying Shan. 2023. VideoCrafter1: Open Diffusion Models for High-Quality Video Generation. arXiv:2310.19512 [cs.CV] https:\/\/arxiv.org\/abs\/2310.19512"},{"key":"e_1_3_2_1_3_1","unstructured":"Yajing Chen Shikui Tu Yuqi Yi and Lei Xu. 2017. Sketch-pix2seq: a Model to Generate Sketches of Multiple Categories. arXiv:1709.04121 [cs.CV] https:\/\/arxiv.org\/abs\/1709.04121"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Katherine Crowson Stella Biderman Daniel Kornis Dashiell Stander Eric Hallahan Louis Castricato and Edward Raff. 2022. VQGAN-CLIP: Open Domain Image Generation and Editing with Natural Language Guidance. arXiv:2204.08583 [cs.CV] https:\/\/arxiv.org\/abs\/2204.08583","DOI":"10.1007\/978-3-031-19836-6_6"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1185657.1185776"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00839"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Rinon Gal Yael Vinker Yuval Alaluf Amit H. Bermano Daniel Cohen-Or Ariel Shamir and Gal Chechik. 2023. Breathing Life Into Sketches Using Text-to-Video Priors. arXiv:2311.13608 [cs.CV] https:\/\/arxiv.org\/abs\/2311.13608","DOI":"10.1109\/CVPR52733.2024.00414"},{"key":"e_1_3_2_1_8_1","unstructured":"David Ha and Douglas Eck. 2017. A Neural Representation of Sketch Drawings. arXiv:1704.03477 [cs.NE] https:\/\/arxiv.org\/abs\/1704.03477"},{"key":"e_1_3_2_1_9_1","volume-title":"Imagen Video: High Definition Video Generation with Diffusion Models. arXiv:2210.02303 [cs.CV] https:\/\/arxiv.org\/abs\/2210.02303","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, William Chan, Chitwan Saharia, Jay Whang, Ruiqi Gao, Alexey Gritsenko, Diederik P. Kingma, Ben Poole, Mohammad Norouzi, David J. Fleet, and Tim Salimans. 2022. Imagen Video: High Definition Video Generation with Diffusion Models. arXiv:2210.02303 [cs.CV] https:\/\/arxiv.org\/abs\/2210.02303"},{"key":"e_1_3_2_1_10_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. arXiv:2006.11239 [cs.LG] https:\/\/arxiv.org\/abs\/2006.11239"},{"key":"e_1_3_2_1_11_1","unstructured":"Doyeon Kim Donggyu Joo and Junmo Kim. 2021. TiVGAN: Text to Image to Video Generation with Step-by-Step Evolutionary Generator. arXiv:2009.02018 [cs.CV] https:\/\/arxiv.org\/abs\/2009.02018"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417871"},{"key":"e_1_3_2_1_13_1","unstructured":"Hangyu Lin Yanwei Fu Yu-Gang Jiang and Xiangyang Xue. 2020. Sketch-BERT: Learning Sketch Bidirectional Encoder Representation from Transformers by Self-supervised Learning of Sketch Gestalt. arXiv:2005.09159 [cs.CV] https:\/\/arxiv.org\/abs\/2005.09159"},{"key":"e_1_3_2_1_14_1","volume-title":"Sora: A Review on Background, Technology, Limitations, and Opportunities of Large Vision Models. arXiv:2402.17177 [cs.CV] https:\/\/arxiv.org\/abs\/2402.17177","author":"Liu Yixin","year":"2024","unstructured":"Yixin Liu, Kai Zhang, Yuan Li, Zhiling Yan, Chujie Gao, Ruoxi Chen, Zhengqing Yuan, Yue Huang, Hanchi Sun, Jianfeng Gao, Lifang He, and Lichao Sun. 2024. Sora: A Review on Background, Technology, Limitations, and Opportunities of Large Vision Models. arXiv:2402.17177 [cs.CV] https:\/\/arxiv.org\/abs\/2402.17177"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28197"},{"key":"e_1_3_2_1_16_1","unstructured":"Yiwei Ma Guohai Xu Xiaoshuai Sun Ming Yan Ji Zhang and Rongrong Ji. 2022. X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. arXiv:2207.07285 [cs.CV] https:\/\/arxiv.org\/abs\/2207.07285"},{"key":"e_1_3_2_1_17_1","volume-title":"Unsupervised Doodling and Painting with Improved SPIRAL. CoRR abs\/1910.01007","author":"Mellor John F. J.","year":"2019","unstructured":"John F. J. Mellor, Eunbyung Park, Yaroslav Ganin, Igor Babuschkin, Tejas Kulkarni, Dan Rosenbaum, Andy Ballard, Theophane Weber, Oriol Vinyals, and S. M. Ali Eslami. 2019. Unsupervised Doodling and Painting with Improved SPIRAL. CoRR abs\/1910.01007 (2019). arXiv:1910.01007 http:\/\/arxiv.org\/abs\/1910.01007"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--642--22639--7_17"},{"key":"e_1_3_2_1_19_1","unstructured":"Ben Poole Ajay Jain Jonathan T. Barron and Ben Mildenhall. 2022. DreamFusion: Text-to-3D using 2D Diffusion. arXiv:2209.14988 [cs.CV] https:\/\/arxiv.org\/abs\/2209.14988"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00408"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_7"},{"key":"e_1_3_2_1_22_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_23_1","unstructured":"Aditya Ramesh Mikhail Pavlov Gabriel Goh Scott Gray Chelsea Voss Alec Radford Mark Chen and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. arXiv:2102.12092 [cs.CV] https:\/\/arxiv.org\/abs\/2102.12092"},{"key":"e_1_3_2_1_24_1","volume-title":"Sketchformer: Transformer-based Representation for Sketched Structure. arXiv:2002.10381 [cs.CV] https:\/\/arxiv.org\/abs\/2002.10381","author":"Ferraz Ribeiro Leo Sampaio","year":"2020","unstructured":"Leo Sampaio Ferraz Ribeiro, Tu Bui, John Collomosse, and Moacir Ponti. 2020. Sketchformer: Transformer-based Representation for Sketched Structure. arXiv:2002.10381 [cs.CV] https:\/\/arxiv.org\/abs\/2002.10381"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bjorn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arXiv:2112.10752 [cs.CV] https:\/\/arxiv.org\/abs\/2112.10752","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_26_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni Devi Parikh Sonal Gupta and Yaniv Taigman. 2022. Make-A-Video: Text-to-Video Generation without Text-Video Data. arXiv:2209.14792 [cs.CV] https:\/\/arxiv.org\/abs\/2209.14792"},{"key":"e_1_3_2_1_27_1","volume-title":"Hodgins","author":"Smith Harrison Jesse","year":"2023","unstructured":"Harrison Jesse Smith, Qingyuan Zheng, Yifei Li, Somya Jain, and Jessica K. Hodgins. 2023. A Method for Animating Children's Drawings of the Human Figure. arXiv:2303.12741 [cs.CV] https:\/\/arxiv.org\/abs\/2303.12741"},{"key":"e_1_3_2_1_28_1","volume-title":"Hodgins","author":"Smith Harrison Jesse","year":"2023","unstructured":"Harrison Jesse Smith, Qingyuan Zheng, Yifei Li, Somya Jain, and Jessica K. Hodgins. 2023. A Method for Animating Children's Drawings of the Human Figure. arXiv:2303.12741 [cs.CV] https:\/\/arxiv.org\/abs\/2303.12741"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174236"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Sergey Tulyakov Ming-Yu Liu Xiaodong Yang and Jan Kautz. 2017. MoCoGAN: Decomposing Motion and Content for Video Generation. arXiv:1707.04993 [cs.CV] https:\/\/arxiv.org\/abs\/1707.04993","DOI":"10.1109\/CVPR.2018.00165"},{"key":"e_1_3_2_1_31_1","unstructured":"Varshaneya V S Balasubramanian and Vineeth N Balasubramanian. 2019. Teaching GANs to Sketch in Vector Format. arXiv:1904.03620 [cs.GR] https:\/\/arxiv.org\/abs\/1904.03620"},{"key":"e_1_3_2_1_32_1","unstructured":"Varshaneya V S Balasubramanian and Vineeth N Balasubramanian. 2019. Teaching GANs to Sketch in Vector Format. arXiv:1904.03620 [cs.GR] https:\/\/arxiv.org\/abs\/1904.03620"},{"key":"e_1_3_2_1_33_1","volume-title":"Santiago Castro, Julius Kunze, and Dumitru Erhan.","author":"Villegas Ruben","year":"2022","unstructured":"Ruben Villegas, Mohammad Babaeizadeh, Pieter-Jan Kindermans, Hernan Moraldo, Han Zhang, Mohammad Taghi Saffar, Santiago Castro, Julius Kunze, and Dumitru Erhan. 2022. Phenaki: Variable Length Video Generation From Open Domain Textual Description. arXiv:2210.02399 [cs.CV] https:\/\/arxiv.org\/abs\/2210.02399"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Yael Vinker Yuval Alaluf Daniel Cohen-Or and Ariel Shamir. 2023. CLIPascene: Scene Sketching with Different Types and Levels of Abstraction. arXiv:2211.17256 [cs.CV] https:\/\/arxiv.org\/abs\/2211.17256","DOI":"10.1109\/ICCV51070.2023.00383"},{"key":"e_1_3_2_1_35_1","volume-title":"Amit Haim Bermano, Daniel Cohen-Or, Amir Zamir, and Ariel Shamir.","author":"Vinker Yael","year":"2022","unstructured":"Yael Vinker, Ehsan Pajouheshgar, Jessica Y. Bo, Roman Christian Bachmann, Amit Haim Bermano, Daniel Cohen-Or, Amir Zamir, and Ariel Shamir. 2022. CLIPasso: Semantically-Aware Object Sketching. arXiv:2202.05822 [cs.GR] https:\/\/arxiv.org\/abs\/2202.05822"},{"key":"e_1_3_2_1_36_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023. ModelScope Text-to-Video Technical Report. arXiv:2308.06571 [cs.CV] https:\/\/arxiv.org\/abs\/2308.06571"},{"key":"e_1_3_2_1_37_1","volume-title":"SketchKnitter: Vectorized Sketch Generation with Diffusion Models. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=4eJ43EN2g6l","author":"Wang Qiang","year":"2023","unstructured":"Qiang Wang, Haoge Deng, Yonggang Qi, Da Li, and Yi-Zhe Song. 2023. SketchKnitter: Vectorized Sketch Generation with Diffusion Models. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=4eJ43EN2g6l"},{"key":"e_1_3_2_1_38_1","volume-title":"GODIVA: Generating Open-DomaIn Videos from nAtural Descriptions. arXiv:2104.14806 [cs.CV] https:\/\/arxiv.org\/abs\/2104.14806","author":"Wu Chenfei","year":"2021","unstructured":"Chenfei Wu, Lun Huang, Qianxi Zhang, Binyang Li, Lei Ji, Fan Yang, Guillermo Sapiro, and Nan Duan. 2021. GODIVA: Generating Open-DomaIn Videos from nAtural Descriptions. arXiv:2104.14806 [cs.CV] https:\/\/arxiv.org\/abs\/2104.14806"},{"key":"e_1_3_2_1_39_1","unstructured":"Chenfei Wu Jian Liang Lei Ji Fan Yang Yuejian Fang Daxin Jiang and Nan Duan. 2021. N\u00dcWA: Visual Synthesis Pre-training for Neural visUal World creAtion. arXiv:2111.12417 [cs.CV] https:\/\/arxiv.org\/abs\/2111.12417"},{"key":"e_1_3_2_1_40_1","unstructured":"Jay Zhangjie Wu Yixiao Ge Xintao Wang Weixian Lei Yuchao Gu Yufei Shi Wynne Hsu Ying Shan Xiaohu Qie and Mike Zheng Shou. 2023. Tune-AVideo: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation. arXiv:2212.11565 [cs.CV] https:\/\/arxiv.org\/abs\/2212.11565"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818079"},{"key":"e_1_3_2_1_42_1","volume-title":"European Conference on Computer Vision. Springer, 447--463","author":"Zhao Zhe","year":"2024","unstructured":"Zhe Zhao, Mengshi Qi, and Huadong Ma. 2024. Decomposed vector-quantized variational autoencoder for human grasp generation. In European Conference on Computer Vision. Springer, 447--463."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2023.3251095"}],"event":{"name":"MM '25:The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland"},"container-title":["Proceedings of the 3rd International Workshop on Multimodal and Responsible Affective Computing"],"original-title":[],"deposited":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T17:23:22Z","timestamp":1761153802000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746270.3760235"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":43,"alternative-id":["10.1145\/3746270.3760235","10.1145\/3746270"],"URL":"https:\/\/doi.org\/10.1145\/3746270.3760235","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]},"assertion":[{"value":"2025-10-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}