{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T22:46:03Z","timestamp":1775861163078,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Hubei Provincial Natural Science Foundation of China","award":["No.2022CFA055"],"award-info":[{"award-number":["No.2022CFA055"]}]},{"name":"National Natural Science Foundation of China","award":["No.62176097"],"award-info":[{"award-number":["No.62176097"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680836","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"3199-3208","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Tunnel Try-on: Excavating Spatial-temporal Tunnels for High-quality Virtual Try-on in Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2482-8334","authenticated-orcid":false,"given":"Zhengze","family":"Xu","sequence":"first","affiliation":[{"name":"School of AIA, Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7498-5761","authenticated-orcid":false,"given":"Mengting","family":"Chen","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6104-6133","authenticated-orcid":false,"given":"Zhao","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4727-1017","authenticated-orcid":false,"given":"Linyu","family":"Xing","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4878-1892","authenticated-orcid":false,"given":"Zhonghua","family":"Zhai","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9167-1496","authenticated-orcid":false,"given":"Nong","family":"Sang","sequence":"additional","affiliation":[{"name":"State Key Lab of MIIPT, Huazhong University of Science and Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6890-4960","authenticated-orcid":false,"given":"Jinsong","family":"Lan","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8550-5064","authenticated-orcid":false,"given":"Shuai","family":"Xiao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2736-3920","authenticated-orcid":false,"given":"Changxin","family":"Gao","sequence":"additional","affiliation":[{"name":"State Key Lab of MIIPT, Huazhong University of Science and Technology, Wuhan, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02138"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021"},{"key":"e_1_3_2_1_4_1","volume-title":"Wear-Any-Way: Manipulable Virtual Try-on via Sparse Correspondence Alignment. arXiv preprint arXiv:2403.12965","author":"Chen Mengting","year":"2024","unstructured":"Mengting Chen, Xi Chen, Zhonghua Zhai, Chen Ju, Xuewen Hong, Jinsong Lan, and Shuai Xiao. 2024. Wear-Any-Way: Manipulable Virtual Try-on via Sparse Correspondence Alignment. arXiv preprint arXiv:2403.12965 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Anydoor: Zero-shot object-level image customization. arXiv preprint arXiv:2307.09481","author":"Chen Xi","year":"2023","unstructured":"Xi Chen, Lianghua Huang, Yu Liu, Yujun Shen, Deli Zhao, and Hengshuang Zhao. 2023. Anydoor: Zero-shot object-level image customization. arXiv preprint arXiv:2307.09481 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"LivePhoto: Real Image Animation with Text-guided Motion Control. arXiv preprint arXiv:2312.02928","author":"Chen Xi","year":"2023","unstructured":"Xi Chen, Zhiheng Liu, Mengting Chen, Yutong Feng, Yu Liu, Yujun Shen, and Hengshuang Zhao. 2023. LivePhoto: Real Image Animation with Text-guided Motion Control. arXiv preprint arXiv:2312.02928 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00125"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00814"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00838"},{"key":"e_1_3_2_1_11_1","volume-title":"Taming the Power of Diffusion Models for High-Quality Virtual Try-On with Appearance Flow. arXiv preprint arXiv:2308.06101","author":"Gou Junhong","year":"2023","unstructured":"Junhong Gou, Siyu Sun, Jianfu Zhang, Jianlou Si, Chen Qian, and Liqing Zhang. 2023. Taming the Power of Diffusion Models for High-Quality Virtual Try-On with Appearance Flow. arXiv preprint arXiv:2308.06101 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Learning Representations","author":"Guo Yuwei","year":"2024","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Zhengyang Liang, Yaohui Wang, Yu Qiao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2024. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. International Conference on Learning Representations (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00787"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_2_1_15_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_16_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_17_1","unstructured":"Jonathan Ho Ajay Jain and P. Abbeel. 2020. Denoising Diffusion Probabilistic Models. ArXiv abs\/2006.11239 (2020). https:\/\/api.semanticscholar.org\/CorpusID: 219955663"},{"key":"e_1_3_2_1_18_1","volume-title":"Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. ArXiv abs\/2311.17117","author":"Hu Liucheng","year":"2023","unstructured":"Liucheng Hu, Xin Gao, Peng Zhang, Ke Sun, Bang Zhang, and Liefeng Bo. 2023. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. ArXiv abs\/2311.17117 (2023). https:\/\/api.semanticscholar. org\/CorpusID:265499043"},{"key":"e_1_3_2_1_19_1","volume-title":"Make It Move: Controllable Image-to-Video Generation with Text Descriptions. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","author":"Hu Yaosi","year":"2021","unstructured":"Yaosi Hu, Chong Luo, and Zhenzhong Chen. 2021. Make It Move: Controllable Image-to-Video Generation with Text Descriptions. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 18198--18207. https: \/\/api.semanticscholar.org\/CorpusID:244908665"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings, Part XX 16","author":"Issenhuth Thibaut","year":"2020","unstructured":"Thibaut Issenhuth, J\u00e9r\u00e9mie Mary, and Cl\u00e9ment Calauzenes. 2020. Do not mask what you do not need to mask: a parser-free virtual try-on. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XX 16. Springer, 619--635."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01053"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"e_1_3_2_1_23_1","volume-title":"StableVITON: Learning Semantic Correspondence with Latent Diffusion Model for Virtual Try-On. arXiv preprint arXiv:2312.01725","author":"Kim Jeongho","year":"2023","unstructured":"Jeongho Kim, Gyojung Gu, Minho Park, Sunghyun Park, and Jaegul Choo. 2023. StableVITON: Learning Semantic Correspondence with Latent Diffusion Model for Virtual Try-On. arXiv preprint arXiv:2312.01725 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW52041.2021.00025"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_13"},{"key":"e_1_3_2_1_27_1","volume-title":"Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784","author":"Mirza Mehdi","year":"2014","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784 (2014)."},{"key":"e_1_3_2_1_28_1","volume-title":"LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On. arXiv preprint arXiv:2305.13501","author":"Morelli Davide","year":"2023","unstructured":"Davide Morelli, Alberto Baldrati, Giuseppe Cartella, Marcella Cornia, Marco Bertini, and Rita Cucchiara. 2023. LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On. arXiv preprint arXiv:2305.13501 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00243"},{"key":"e_1_3_2_1_30_1","volume-title":"Conditional Image-to-Video Generation with Latent Flow Diffusion Models. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","author":"Ni Haomiao","year":"2023","unstructured":"Haomiao Ni, Changhao Shi, Kaican Li, Sharon X. Huang, and Martin Renqiang Min. 2023. Conditional Image-to-Video Generation with Latent Flow Diffusion Models. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023), 18444--18455. https:\/\/api.semanticscholar.org\/CorpusID: 257757116"},{"key":"e_1_3_2_1_31_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_34_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention-MICCAI 2015: 18th international conference","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention-MICCAI 2015: 18th international conference, Munich, Germany, October 5--9, 2015, proceedings, part III 18. Springer, 234--241."},{"key":"e_1_3_2_1_35_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28288"},{"key":"e_1_3_2_1_37_1","volume-title":"International conference on machine learning. PMLR, 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International conference on machine learning. PMLR, 2256--2265."},{"key":"e_1_3_2_1_38_1","volume-title":"EDGE: Editable Dance Generation From Music. arXiv preprint arXiv:2211.10658","author":"Tseng Jonathan","year":"2022","unstructured":"Jonathan Tseng, Rodrigo Castellon, and C Karen Liu. 2022. EDGE: Editable Dance Generation From Music. arXiv preprint arXiv:2211.10658 (2022)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_36"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00891"},{"key":"e_1_3_2_1_41_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13, 4 (2004), 600--612."},{"key":"e_1_3_2_1_42_1","unstructured":"Greg Welch Gary Bishop et al. 1995. An introduction to the Kalman filter. (1995)."},{"key":"e_1_3_2_1_43_1","volume-title":"Hanshu Yan, Jia-Wei Liu, Chenxu Zhang, Jiashi Feng, and Mike Zheng Shou.","author":"Xu Zhongcong","year":"2023","unstructured":"Zhongcong Xu, Jianfeng Zhang, Jun Hao Liew, Hanshu Yan, Jia-Wei Liu, Chenxu Zhang, Jiashi Feng, and Mike Zheng Shou. 2023. MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model. ArXiv abs\/2311.16498 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265466012"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_2_1_45_1","volume-title":"MagicAvatar: Multimodal Avatar Generation and Animation. ArXiv abs\/2308.14748","author":"Zhang Jianfeng","year":"2023","unstructured":"Jianfeng Zhang, Hanshu Yan, Zhongcong Xu, Jiashi Feng, and Jun Hao Liew. 2023. MagicAvatar: Multimodal Avatar Generation and Animation. ArXiv abs\/2308.14748 (2023). https:\/\/api.semanticscholar.org\/CorpusID:261276847"},{"key":"e_1_3_2_1_46_1","unstructured":"Lvmin Zhang. 2023.5. Reference-only controlnet. https:\/\/github.com\/Mikubill\/sdwebui- controlnet\/discussions\/1236."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00756"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00364"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475269"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00447"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680836","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680836","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:08Z","timestamp":1750295888000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680836"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3680836","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680836","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}