{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T19:10:04Z","timestamp":1755976204861,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3688865.3689480","type":"proceedings-article","created":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T14:10:52Z","timestamp":1729433452000},"page":"7-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["XFashion: Character Animation Generation via Facial-enhanced and Granularly Controlling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8714-4151","authenticated-orcid":false,"given":"Yu","family":"Zhao","sequence":"first","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0513-5540","authenticated-orcid":false,"given":"Bobo","family":"Li","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3026-6347","authenticated-orcid":false,"given":"Hao","family":"Fei","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2016.7477553"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2310.19512"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02496"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V34I03.5646"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.329"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00730"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, and Wynne Hsu. 2024. Video-of-thought: Step-by-step video reasoning from perception to cognition. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_11_1","volume-title":"international conference on machine learning. PMLR, 6373--6391","author":"Fei Hao","year":"2022","unstructured":"Hao Fei, Shengqiong Wu, Yafeng Ren, and Meishan Zhang. 2022. Matching structure for dual learning. In international conference on machine learning. PMLR, 6373--6391."},{"key":"e_1_3_2_1_12_1","volume-title":"VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing.","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing. (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"2024 d. Enhancing video-language representations with structural spatio-temporal alignment","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Meishan Zhang, Min Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024 d. Enhancing video-language representations with structural spatio-temporal alignment. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3689171"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.7"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6271"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657407"},{"key":"e_1_3_2_1_18_1","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Guo Yuwei","year":"2024","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2024. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7--11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=Fx2SbBgcte"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.579"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2311.17117"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01256"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3183434"},{"volume-title":"Auto-Encoding Variational Bayes. In 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14--16, 2014, Conference Track Proceedings. http:\/\/arxiv.org\/abs\/1312","author":"Diederik","key":"e_1_3_2_1_24_1","unstructured":"Diederik P. Kingma and Max Welling. 2014. Auto-Encoding Variational Bayes. In 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14--16, 2014, Conference Track Proceedings. http:\/\/arxiv.org\/abs\/1312.6114"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612053"},{"key":"e_1_3_2_1_26_1","volume-title":"Fine-grained semantically aligned vision-language pre-training. Advances in neural information processing systems","author":"Li Juncheng","year":"2022","unstructured":"Juncheng Li, Xin He, Longhui Wei, Long Qian, Linchao Zhu, Lingxi Xie, Yueting Zhuang, Qi Tian, and Siliang Tang. 2022. Fine-grained semantically aligned vision-language pre-training. Advances in neural information processing systems, Vol. 35 (2022), 7290--7303."},{"key":"e_1_3_2_1_27_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Li Juncheng","year":"2023","unstructured":"Juncheng Li, Kaihang Pan, Zhiqi Ge, Minghe Gao, Hanwang Zhang, Wei Ji, Wenqiao Zhang, Tat-Seng Chua, Siliang Tang, and Yueting Zhuang. 2023. Fine-tuning multimodal llms to follow zero-shot demonstrative instructions. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3274139"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.semeval-1.226"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645677"},{"key":"e_1_3_2_1_33_1","volume-title":"Conditional Generative Adversarial Nets. CoRR","author":"Mirza Mehdi","year":"2014","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional Generative Adversarial Nets. CoRR, Vol. abs\/1411.1784 (2014). showeprint[arXiv]1411.1784 http:\/\/arxiv.org\/abs\/1411.1784"},{"key":"e_1_3_2_1_34_1","volume-title":"Momentor: Advancing video large language model with fine-grained temporal reasoning. arXiv preprint arXiv:2402.11435","author":"Qian Long","year":"2024","unstructured":"Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, and Siliang Tang. 2024. Momentor: Advancing video large language model with fine-grained temporal reasoning. arXiv preprint arXiv:2402.11435 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--319--24574--4_28"},{"key":"e_1_3_2_1_37_1","volume-title":"First Order Motion Model for Image Animation. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Siarohin Aliaksandr","year":"2019","unstructured":"Aliaksandr Siarohin, St\u00e9phane Lathuili\u00e8re, Sergey Tulyakov, Elisa Ricci, and Nicu Sebe. 2019. First Order Motion Model for Image Animation. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8--14, 2019, Vancouver, BC, Canada. 7135--7145."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01344"},{"key":"e_1_3_2_1_39_1","volume-title":"Towards Accurate Generative Models of Video: A New Metric & Challenges. CoRR","author":"Unterthiner Thomas","year":"2018","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2018. Towards Accurate Generative Models of Video: A New Metric & Challenges. CoRR, Vol. abs\/1812.01717 (2018). showeprint[arXiv]1812.01717 http:\/\/arxiv.org\/abs\/1812.01717"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2307.00040"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_43_1","volume-title":"ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large Language Models. arXiv preprint arXiv:2407.21534","author":"Wu Mingrui","year":"2024","unstructured":"Mingrui Wu, Xinyue Cai, Jiayi Ji, Jiale Li, Oucheng Huang, Gen Luo, Hao Fei, Xiaoshuai Sun, and Rongrong Ji. 2024. ControlMLLM: Training-Free Visual Prompt Learning for Multimodal Large Language Models. arXiv preprint arXiv:2407.21534 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.823"},{"key":"e_1_3_2_1_45_1","volume-title":"Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Xiangtai Li, Jiayi Ji, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the International Conference on Machine Learning.","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024. NExT-GPT: Any-to-Any Multimodal LLM. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. 79240--79259","author":"Wu Shengqiong","year":"2023","unstructured":"Shengqiong Wu, Hao Fei, Hanwang Zhang, and Tat-Seng Chua. 2023. Imagine that! abstract-to-intricate text-to-image synthesis with scene graph hallucination diffusion. In Proceedings of the 37th International Conference on Neural Information Processing Systems. 79240--79259."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2311.16498"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01479"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00690"},{"key":"e_1_3_2_1_51_1","volume-title":"30th British Machine Vision Conference 2019, BMVC 2019","author":"Zablotskaia Polina","year":"2019","unstructured":"Polina Zablotskaia, Aliaksandr Siarohin, Bo Zhao, and Leonid Sigal. 2019. DwNet: Dense warp-based network for pose-guided human video generation. In 30th British Machine Vision Conference 2019, BMVC 2019, Cardiff, UK, September 9--12, 2019. BMVA Press, 51. https:\/\/bmvc2019.org\/wp-content\/uploads\/papers\/1039-paper.pdf"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00756"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_54_1","volume-title":"Chen Change Loy, and Shuicheng Yan","author":"Zhang Tao","year":"2024","unstructured":"Tao Zhang, Xiangtai Li, Hao Fei, Haobo Yuan, Shengqiong Wu, Shunping Ji, Chen Change Loy, and Shuicheng Yan. 2024. Omg-llava: Bridging image-level, object-level, pixel-level reasoning and understanding. arXiv preprint arXiv:2406.19389 (2024)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612096"},{"key":"e_1_3_2_1_57_1","volume-title":"Generating visual spatial description via holistic 3D scene understanding. arXiv preprint arXiv:2305.11768","author":"Zhao Yu","year":"2023","unstructured":"Yu Zhao, Hao Fei, Wei Ji, Jianguo Wei, Meishan Zhang, Min Zhang, and Tat-Seng Chua. 2023. Generating visual spatial description via holistic 3D scene understanding. arXiv preprint arXiv:2305.11768 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 5th International Workshop on Human-centric Multimedia Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3688865.3689480","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3688865.3689480","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:38:55Z","timestamp":1755974335000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3688865.3689480"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":57,"alternative-id":["10.1145\/3688865.3689480","10.1145\/3688865"],"URL":"https:\/\/doi.org\/10.1145\/3688865.3689480","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}