{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:00Z","timestamp":1765343040258,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["62311530100, 62171251"],"award-info":[{"award-number":["62311530100, 62171251"]}]},{"name":"the Special Foundations for the Development of Strategic Emerging Industries of Shenzhen","award":["KJZD20231023094700001"],"award-info":[{"award-number":["KJZD20231023094700001"]}]},{"name":"the Major Key Research Project of PCL","award":["PCL2023A08"],"award-info":[{"award-number":["PCL2023A08"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754776","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"2811-2820","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhanced Motion-aware Latent Diffusion Models for Video Frame Interpolation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3417-743X","authenticated-orcid":false,"given":"Zhilin","family":"Huang","sequence":"first","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China and Pengcheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8223-873X","authenticated-orcid":false,"given":"Chujun","family":"Qin","sequence":"additional","affiliation":[{"name":"China Southern Power Grid Co., Ltd., Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5206-9515","authenticated-orcid":false,"given":"Yifei","family":"Xing","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Shenzhen, China and Pengcheng Laboratory, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2506-1286","authenticated-orcid":false,"given":"Wenming","family":"Yang","sequence":"additional","affiliation":[{"name":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China and Pengcheng Laboratory, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-010-0390-2"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_1_3_1","volume-title":"Classifier-free guidance is a predictor-corrector. arXiv preprint arXiv:2408.09000","author":"Bradley Arwen","year":"2024","unstructured":"Arwen Bradley and Preetum Nakkiran. 2024. Classifier-free guidance is a predictor-corrector. arXiv preprint arXiv:2408.09000 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops. 0-0.","author":"Chaney Kenneth","year":"2019","unstructured":"Kenneth Chaney, Alex Zihao Zhu, and Kostas Daniilidis. 2019. Learning event-based height from plane and parallax. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops. 0-0."},{"key":"e_1_3_2_1_5_1","volume-title":"Pixart-alpha: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426","author":"Chen Junsong","year":"2023","unstructured":"Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Yue Wu, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, et al., 2023. Pixart-alpha: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio.","author":"Cho Kyunghyun","year":"2014","unstructured":"Kyunghyun Cho, Bart Van Merri\u00ebnboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6693"},{"key":"e_1_3_2_1_8_1","volume-title":"FloLPIPS: A Bespoke Video Quality Metric for Frame Interpolation. In IEEE Picture Coding Symposium. 283-287","author":"Danier Duolikun","year":"2022","unstructured":"Duolikun Danier, Fan Zhang, and David Bull. 2022a. FloLPIPS: A Bespoke Video Quality Metric for Frame Interpolation. In IEEE Picture Coding Symposium. 283-287."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00351"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897364"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27912"},{"volume-title":"Diffusion models beat gans on image synthesis","author":"Dhariwal Prafulla","key":"e_1_3_2_1_12_1","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis, Vol. 34. 8780-8794."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00791"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_16_1","volume-title":"Adv. in Neural Inform. Process. Syst.","volume":"30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs trained by a two time-scale update rule converge to a local nash equilibrium. Adv. in Neural Inform. Process. Syst., Vol. 30 (2017)."},{"key":"e_1_3_2_1_17_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Adv. in Neural Inform. Process. Syst., Vol. 33 (2020), 6840-6851.","journal-title":"Adv. in Neural Inform. Process. Syst."},{"key":"e_1_3_2_1_18_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"Fleet","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Tim Salimans, Alexey A. Gritsenko, William Chan, Mohammad Norouzi, and David J. Fleet. 2022. Video Diffusion Models. In Adv. in Neural Inform. Process. Syst., Alice H. Oh, Alekh Agarwal, Danielle Belgrave, and Kyunghyun Cho (Eds.). https:\/\/openreview.net\/forum?id=f3zNgKga_ep"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00012"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29162"},{"key":"e_1_3_2_1_22_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Huang Zhilin","year":"2024","unstructured":"Zhilin Huang, Ling Yang, Xiangxin Zhou, Zhilong Zhang, Wentao Zhang, Xiawu Zheng, Jie Chen, Yu Wang, Bin Cui, and Wenming Yang. 2024c. Protein-ligand interaction prior for binding-aware 3d molecule diffusion models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680846"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00936"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/1046920.1088696"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"e_1_3_2_1_27_1","volume-title":"3D convolutional neural networks for human action recognition","author":"Ji Shuiwang","year":"2012","unstructured":"Shuiwang Ji, Wei Xu, Ming Yang, and Kai Yu. 2012. 3D convolutional neural networks for human action recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 35, 1 (2012), 221-231."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00938"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00211"},{"key":"e_1_3_2_1_30_1","volume-title":"NTIRE 2023 video colorization challenge. In Proc. of the IEEE Conf. on Comput. Vis. and Pattern Recog., 1570-1581","author":"Kang Xiaoyang","year":"2023","unstructured":"Xiaoyang Kang, Xianhui Lin, Kai Zhang, Zheng Hui, Wangmeng Xiang, Jun-Yan He, Xiaoming Li, Peiran Ren, Xuansong Xie, Radu Timofte, et al., 2023. NTIRE 2023 video colorization challenge. In Proc. of the IEEE Conf. on Comput. Vis. and Pattern Recog., 1570-1581."},{"key":"e_1_3_2_1_31_1","first-page":"2426","article-title":"DiffusionCLIP","author":"Kim Gwanghyun","year":"2022","unstructured":"Gwanghyun Kim, Taesung Kwon, and Jong Chul Ye. 2022. DiffusionCLIP: Text-Guided Diffusion Models for Robust Image Manipulation. 2426-2435.","journal-title":"Text-Guided Diffusion Models for Robust Image Manipulation."},{"volume-title":"Adam: A Method for Stochastic Optimization. In Int. Conf. on Learn. Represent.","author":"Diederik","key":"e_1_3_2_1_32_1","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In Int. Conf. on Learn. Represent."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00201"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00536"},{"key":"e_1_3_2_1_35_1","volume-title":"Efficient event stream super-resolution with recursive multi-branch fusion. arXiv preprint arXiv:2406.19640","author":"Liang Quanmin","year":"2024","unstructured":"Quanmin Liang, Zhilin Huang, Xiawu Zheng, Feidiao Yang, Jun Peng, Kai Huang, and Yonghong Tian. 2024. Efficient event stream super-resolution with recursive multi-branch fusion. arXiv preprint arXiv:2406.19640 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Decoupled Weight Decay Regularization. In Int. Conf. on Learn. Represent., https:\/\/openreview.net\/forum?id=Bkg6RiCqY7","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Int. Conf. on Learn. Represent., https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00352"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3108943"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00548"},{"key":"e_1_3_2_1_40_1","volume-title":"BMBC: Bilateral motion estimation with bilateral cost","author":"Park Junheum","year":"2020","unstructured":"Junheum Park, Keunsoo Ko, Chul Lee, and Chang-Su Kim. 2020. BMBC: Bilateral motion estimation with bilateral cost volume for video interpolation. In Comput. Vis.-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XIV 16. Springer, 109-125."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01427"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_2_1_43_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00398"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01422"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00652"},{"key":"e_1_3_2_1_49_1","volume-title":"Denoising Diffusion Implicit Models. In Int. Conf. on Learn. Represent.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021a. Denoising Diffusion Implicit Models. In Int. Conf. on Learn. Represent."},{"volume-title":"Generative modeling by estimating gradients of the data distribution","author":"Song Yang","key":"e_1_3_2_1_50_1","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution, Vol. 32."},{"key":"e_1_3_2_1_51_1","volume-title":"International Conference on Learning Representations.","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-Based Generative Modeling through Stochastic Differential Equations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_52_1","volume-title":"Int. Conf. on Learn. Represent., https:\/\/openreview.net\/forum?id=PxTIG12RRHS","author":"Song Yang","year":"2021","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2021b. Score-Based Generative Modeling through Stochastic Differential Equations. In Int. Conf. on Learn. Represent., https:\/\/openreview.net\/forum?id=PxTIG12RRHS"},{"key":"e_1_3_2_1_53_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_54_1","volume-title":"Adv. in Neural Inform. Process. Syst.","volume":"30","author":"Den Oord Aaron Van","year":"2017","unstructured":"Aaron Van Den Oord, Oriol Vinyals, et al., 2017. Neural discrete representation learning. Adv. in Neural Inform. Process. Syst., Vol. 30 (2017)."},{"key":"e_1_3_2_1_55_1","volume-title":"A connection between score matching and denoising autoencoders. Neural computation","author":"Vincent Pascal","year":"2011","unstructured":"Pascal Vincent. 2011. A connection between score matching and denoising autoencoders. Neural computation, Vol. 23, 7 (2011), 1661-1674."},{"key":"e_1_3_2_1_56_1","first-page":"23371","article-title":"MCVD - Masked Conditional Video Diffusion for Prediction, Generation, and Interpolation","volume":"35","author":"Voleti Vikram","year":"2022","unstructured":"Vikram Voleti, Alexia Jolicoeur-Martineau, and Christopher Pal. 2022. MCVD - Masked Conditional Video Diffusion for Prediction, Generation, and Interpolation. In Adv. in Neural Inform. Process. Syst., Vol. 35. 23371-23385.","journal-title":"Adv. in Neural Inform. Process. Syst."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-01144-2"},{"key":"e_1_3_2_1_59_1","volume-title":"Diffusion-based scene graph to image generation with masked contrastive pre-training. arXiv preprint arXiv:2211.11138","author":"Yang Ling","year":"2022","unstructured":"Ling Yang, Zhilin Huang, Yang Song, Shenda Hong, Guohao Li, Wentao Zhang, Bin Cui, Bernard Ghanem, and Ming-Hsuan Yang. 2022a. Diffusion-based scene graph to image generation with masked contrastive pre-training. arXiv preprint arXiv:2211.11138 (2022)."},{"key":"e_1_3_2_1_60_1","volume-title":"NTIRE 2021 challenge on quality enhancement of compressed video: Dataset and study. In Proc. of the IEEE Conf. on Comput. Vis. and Pattern Recog., 667-676","author":"Yang Ren","year":"2021","unstructured":"Ren Yang. 2021. NTIRE 2021 challenge on quality enhancement of compressed video: Dataset and study. In Proc. of the IEEE Conf. on Comput. Vis. and Pattern Recog., 667-676."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/214"},{"key":"e_1_3_2_1_62_1","first-page":"13294","article-title":"Resshift: Efficient diffusion model for image super-resolution by residual shifting","volume":"36","author":"Yue Zongsheng","year":"2023","unstructured":"Zongsheng Yue, Jianyi Wang, and Chen Change Loy. 2023. Resshift: Efficient diffusion model for image super-resolution by residual shifting. Advances in Neural Information Processing Systems, Vol. 36 (2023), 13294-13307.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25465"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCP51581.2021.9466265"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00108"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754776","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:12Z","timestamp":1765342752000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754776"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":66,"alternative-id":["10.1145\/3746027.3754776","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754776","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}