{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:37:13Z","timestamp":1771951033667,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612478","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"7890-7900","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Hierarchical Masked 3D Diffusion Model for Video Outpainting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5214-0959","authenticated-orcid":false,"given":"Fanda","family":"Fan","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7685-0476","authenticated-orcid":false,"given":"Chaoxu","family":"Guo","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5698-6240","authenticated-orcid":false,"given":"Litong","family":"Gong","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5112-987X","authenticated-orcid":false,"given":"Biao","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1381-2692","authenticated-orcid":false,"given":"Tiezheng","family":"Ge","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1665-3025","authenticated-orcid":false,"given":"Yuning","family":"Jiang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6977-929X","authenticated-orcid":false,"given":"Chunjie","family":"Luo","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3728-6837","authenticated-orcid":false,"given":"Jianfeng","family":"Zhan","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE International Conference on Computer Vision.","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_2_1_3_1","volume-title":"Efros","author":"Brooks Tim","year":"2023","unstructured":"Tim Brooks, Aleksander Holynski, and Alexei A. Efros. 2023. InstructPix2Pix: Learning to Follow Image Editing Instructions. In CVPR."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01114"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00084"},{"key":"e_1_3_2_1_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_8_1","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in Neural Information Processing Systems 34 (2021), 8780--8794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.316"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_2_1_15_1","volume-title":"Maskvit: Masked visual pre-training for video prediction. arXiv preprint arXiv:2206.11894","author":"Gupta Agrim","year":"2022","unstructured":"Agrim Gupta, Stephen Tian, Yunzhi Zhang, Jiajun Wu, Roberto Mart\u00edn-Mart\u00edn, and Li Fei-Fei. 2022. Maskvit: Masked visual pre-training for video prediction. arXiv preprint arXiv:2206.11894 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_17_1","volume-title":"Latent Video Diffusion Models for High-Fidelity Video Generation with Arbitrary Lengths. arXiv preprint arXiv:2211.13221","author":"He Yingqing","year":"2022","unstructured":"Yingqing He, Tianyu Yang, Yong Zhang, Ying Shan, and Qifeng Chen. 2022. Latent Video Diffusion Models for High-Fidelity Video Generation with Arbitrary Lengths. arXiv preprint arXiv:2211.13221 (2022)."},{"key":"e_1_3_2_1_18_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_19_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_20_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00090"},{"key":"e_1_3_2_1_23_1","volume-title":"On the effectiveness of task granularity for transfer learning. arXiv preprint arXiv:1804.09235","author":"Mahdisoltani Farzaneh","year":"2018","unstructured":"Farzaneh Mahdisoltani, Guillaume Berger, Waseem Gharbieh, David Fleet, and Roland Memisevic. 2018. On the effectiveness of task granularity for transfer learning. arXiv preprint arXiv:1804.09235 (2018)."},{"key":"e_1_3_2_1_24_1","volume-title":"Yossi Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen.","author":"Molad Eyal","year":"2023","unstructured":"Eyal Molad, Eliahu Horwitz, Dani Valevski, Alex Rav Acha, Yossi Matias, Yael Pritch, Yaniv Leviathan, and Yedid Hoshen. 2023. Dreamix: Video diffusion models are general video editors. arXiv preprint arXiv:2302.01329 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 8162--8171","author":"Nichol Alexander Quinn","year":"2021","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In International Conference on Machine Learning. PMLR, 8162--8171."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_2_1_27_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530757"},{"key":"e_1_3_2_1_30_1","volume-title":"Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Machine Learning. PMLR, 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning. PMLR, 2256--2265."},{"key":"e_1_3_2_1_32_1","volume-title":"Design Booster: A Text-Guided Diffusion Model for Image Translation with Spatial Layout Preservation. arXiv preprint arXiv:2302.02284","author":"Sun Shiqi","year":"2023","unstructured":"Shiqi Sun, Shancheng Fang, Qian He, and Wei Liu. 2023. Design Booster: A Text-Guided Diffusion Model for Image Translation with Spatial Layout Preservation. arXiv preprint arXiv:2302.02284 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Raft: Recurrent all-pairs field transforms for optical flow. In Computer Vision--ECCV 2020: 16th European Conference","author":"Teed Zachary","year":"2020","unstructured":"Zachary Teed and Jia Deng. 2020. Raft: Recurrent all-pairs field transforms for optical flow. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part II 16. Springer, 402--419."},{"key":"e_1_3_2_1_34_1","volume-title":"Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly.","author":"Unterthiner Thomas","year":"2018","unstructured":"Thomas Unterthiner, Sjoerd Van Steenkiste, Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly. 2018. Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)."},{"key":"e_1_3_2_1_35_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Masked conditional video diffusion for prediction, generation, and interpolation. arXiv preprint arXiv:2205.09853","author":"Voleti Vikram","year":"2022","unstructured":"Vikram Voleti, Alexia Jolicoeur-Martineau, and Christopher Pal. 2022. Masked conditional video diffusion for prediction, generation, and interpolation. arXiv preprint arXiv:2205.09853 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3054477"},{"key":"e_1_3_2_1_38_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13, 4 (2004), 600--612."},{"key":"e_1_3_2_1_39_1","volume-title":"Diffusion Models as Masked Autoencoders. arXiv preprint arXiv:2304.03283","author":"Wei Chen","year":"2023","unstructured":"Chen Wei, Karttikeya Mangalam, Po-Yao Huang, Yanghao Li, Haoqi Fan, Hu Xu, Huiyu Wang, Cihang Xie, Alan Yuille, and Christoph Feichtenhofer. 2023. Diffusion Models as Masked Autoencoders. arXiv preprint arXiv:2304.03283 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"A large-scale video object segmentation benchmark. arXiv preprint","author":"Xu N","year":"2018","unstructured":"N Xu, L Yang, Y Fan, D Yue, Y Liang, J Yang, and T YouTube-VOS Huang. 2018. A large-scale video object segmentation benchmark. arXiv preprint (2018)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01517"},{"key":"e_1_3_2_1_42_1","unstructured":"Shengming Yin Chenfei Wu Huan Yang Jianfeng Wang Xiaodong Wang Min-heng Ni Zhengyuan Yang Linjie Li Shuguang Liu Fan Yang et al. 2023. NUWA-XL: Diffusion over Diffusion for eXtremely Long Video Generation. arXiv preprint arXiv:2303.12346 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"MAGVIT: Masked Generative Video Transformer. arXiv preprint arXiv:2212.05199","author":"Yu Lijun","year":"2022","unstructured":"Lijun Yu, Yong Cheng, Kihyuk Sohn, Jos\u00e9 Lezama, Han Zhang, Huiwen Chang, Alexander G Hauptmann, Ming-Hsuan Yang, Yuan Hao, Irfan Essa, et al. 2022. MAGVIT: Masked Generative Video Transformer. arXiv preprint arXiv:2212.05199 (2022)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612478","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612478","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:46Z","timestamp":1755820606000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612478"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":44,"alternative-id":["10.1145\/3581783.3612478","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612478","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}