{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:33:00Z","timestamp":1777656780200,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2022YFB3104703"],"award-info":[{"award-number":["2022YFB3104703"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172103"],"award-info":[{"award-number":["62172103"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681634","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"6870-6879","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["Hi3D: Pursuing High-Resolution Image-to-3D Generation with Video Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-6521-2145","authenticated-orcid":false,"given":"Haibo","family":"Yang","sequence":"first","affiliation":[{"name":"School of Computer Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9058-5051","authenticated-orcid":false,"given":"Yang","family":"Chen","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4344-8898","authenticated-orcid":false,"given":"Yingwei","family":"Pan","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7587-101X","authenticated-orcid":false,"given":"Ting","family":"Yao","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1543-6889","authenticated-orcid":false,"given":"Zhineng","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4182-8261","authenticated-orcid":false,"given":"Chong-Wah","family":"Ngo","sequence":"additional","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5990-7307","authenticated-orcid":false,"given":"Tao","family":"Mei","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets. arXiv preprint arXiv:2311.15127","author":"Blattmann Andreas","year":"2023","unstructured":"Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, Varun Jampani, and Robin Rombach. 2023. Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets. arXiv preprint arXiv:2311.15127 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Sanja Fidler, and Karsten Kreis.","author":"Blattmann Andreas","year":"2023","unstructured":"Andreas Blattmann, Robin Rombach, Huan Ling, Tim Dockhorn, Seung Wook Kim, Sanja Fidler, and Karsten Kreis. 2023. Align your Latents: High-Resolution Video Synthesis with Latent Diffusion Models. In CVPR."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Hansheng Chen Jiatao Gu Anpei Chen Wei Tian Zhuowen Tu Lingjie Liu and Hao Su. 2023. Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00229"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Yang Chen Jingwen Chen Yingwei Pan Xinmei Tian and Tao Mei. 2023. 3D Creation at Your Fingertips: From Text or Image to 3D Assets. In ACM MM.","DOI":"10.1145\/3581783.3612678"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Yang Chen Yingwei Pan Yehao Li Ting Yao and Tao Mei. 2023. Control3d: Towards controllable text-to-3d generation. In ACM MM.","DOI":"10.1145\/3581783.3612489"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Yang Chen Yingwei Pan Haibo Yang Ting Yao and Tao Mei. 2024. Vp3d: Unleashing 2d visual prompt for text-to-3d generation. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00468"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Yang Chen Yingwei Pan Ting Yao Xinmei Tian and Tao Mei. 2019. Animating Your Life: Real-Time Video-to-Animation Translation. In ACM MM.","DOI":"10.1145\/3343031.3350593"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350937"},{"key":"e_1_3_2_1_9_1","volume-title":"V3d: Video diffusion models are effective 3d generators. arXiv preprint arXiv:2403.06738","author":"Chen Zilong","year":"2024","unstructured":"Zilong Chen, Yikai Wang, Feng Wang, Zhengyi Wang, and Huaping Liu. 2024. V3d: Video diffusion models are effective 3d generators. arXiv preprint arXiv:2403.06738 (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Yen-Chi Cheng Hsin-Ying Lee Sergey Tulyakov Alexander G Schwing and Liang-Yan Gui. 2023. SDFusion: Multimodal 3d shape completion reconstruction and generation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00433"},{"key":"e_1_3_2_1_11_1","volume-title":"Eli VanderBilt, Aniruddha Kembhavi","author":"Deitke Matt","year":"2023","unstructured":"Matt Deitke, Ruoshi Liu, Matthew Wallingford, Huong Ngo, Oscar Michel, Aditya Kusupati, Alan Fan, Christian Laforte, Vikram Voleti, Samir Yitzhak Gadre, Eli VanderBilt, Aniruddha Kembhavi, Carl Vondrick, Georgia Gkioxari, Kiana Ehsani, Ludwig Schmidt, and Ali Farhadi. 2023. Objaverse-XL: A Universe of 10M 3D Objects. In NeurIPS."},{"key":"e_1_3_2_1_12_1","volume-title":"Objaverse: A universe of annotated 3d objects. In CVPR.","author":"Deitke Matt","year":"2023","unstructured":"Matt Deitke, Dustin Schwenk, Jordi Salvador, Luca Weihs, Oscar Michel, Eli VanderBilt, Ludwig Schmidt, Kiana Ehsani, Aniruddha Kembhavi, and Ali Farhadi. 2023. Objaverse: A universe of annotated 3d objects. In CVPR."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Kangle Deng Andrew Liu Jun-Yan Zhu and Deva Ramanan. 2022. Depth-supervised NeRF: Fewer Views and Faster Training for Free. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01254"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Laura Downs Anthony Francis Nate Koenig Brandon Kinman Ryan Hickman Krista Reymann Thomas B McHugh and Vincent Vanhoucke. 2022. Google scanned objects: A high-quality dataset of 3d scanned household items. In ICRA.","DOI":"10.1109\/ICRA46639.2022.9811809"},{"key":"e_1_3_2_1_15_1","unstructured":"Qiancheng Fu Qingshan Xu Yew-Soon Ong and Wenbing Tao. 2022. Geo-Neus: Geometry-Consistent Neural Implicit Surfaces Learning for Multi-view Reconstruction. In NeurIPS."},{"key":"e_1_3_2_1_16_1","unstructured":"Yuan-Chen Guo. 2022. Instant Neural Surface Reconstruction. https:\/\/github.com\/bennyguo\/instant-nsr-pl."},{"key":"e_1_3_2_1_17_1","volume-title":"Vfusion3d: Learning scalable 3d generative models from video diffusion models. arXiv preprint arXiv:2403.12034","author":"Han Junlin","year":"2024","unstructured":"Junlin Han, Filippos Kokkinos, and Philip Torr. 2024. Vfusion3d: Learning scalable 3d generative models from video diffusion models. arXiv preprint arXiv:2403.12034 (2024)."},{"key":"e_1_3_2_1_18_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In NeurIPS."},{"key":"e_1_3_2_1_19_1","volume-title":"NeurIPS Workshop.","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. In NeurIPS Workshop."},{"key":"e_1_3_2_1_20_1","volume":"202","author":"Ho Jonathan","unstructured":"Jonathan Ho, Tim Salimans, Alexey Gritsenko, William Chan, Mohammad Norouzi, and David J Fleet. 2022. Video diffusion models. In NeurIPS.","journal-title":"David J Fleet."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Zehuan Huang Hao Wen Junting Dong Yaohui Wang Yangguang Li Xinyuan Chen Yan-Pei Cao Ding Liang Yu Qiao Bo Dai and Lu Sheng. 2024. EpiDiff: Enhancing Multi-View Synthesis via Localized Epipolar-Constrained Diffusion. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00934"},{"key":"e_1_3_2_1_22_1","volume-title":"Shap-e: Generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463","author":"Jun Heewoo","year":"2023","unstructured":"Heewoo Jun and Alex Nichol. 2023. Shap-e: Generating conditional 3d implicit functions. arXiv preprint arXiv:2305.02463 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"3D Gaussian Splatting for Real-Time Radiance Field Rendering. TOG","author":"Kerbl Bernhard","year":"2023","unstructured":"Bernhard Kerbl, Georgios Kopanas, Thomas Leimk\u00fchler, and George Drettakis. 2023. 3D Gaussian Splatting for Real-Time Radiance Field Rendering. TOG (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Varun Jampani, Ming-Hsuan Yang, and Jan Kautz.","author":"Li Xueting","year":"2020","unstructured":"Xueting Li, Sifei Liu, Kihwan Kim, Shalini De Mello, Varun Jampani, Ming-Hsuan Yang, and Jan Kautz. 2020. Self-supervised single-view 3d reconstruction via semantic consistency. In ECCV."},{"key":"e_1_3_2_1_25_1","first-page":"2","volume":"202","author":"Liu Minghua","unstructured":"Minghua Liu, Chao Xu, Haian Jin, Linghao Chen, Zexiang Xu, and Hao Su. 2023. One-2--3--45: Any Single Image to 3D Mesh in 45 Seconds without Per-Shape Optimization. In NeurIPS.","journal-title":"Hao Su."},{"key":"e_1_3_2_1_26_1","volume-title":"Pavel Tokmakov, Sergey Zakharov, and Carl Vondrick.","author":"Liu Ruoshi","year":"2023","unstructured":"Ruoshi Liu, Rundi Wu, Basile Van Hoorick, Pavel Tokmakov, Sergey Zakharov, and Carl Vondrick. 2023. Zero-1-to-3: Zero-shot one image to 3d object. In ICCV."},{"key":"e_1_3_2_1_27_1","unstructured":"Yuan Liu Cheng Lin Zijiao Zeng Xiaoxiao Long Lingjie Liu Taku Komura and Wenping Wang. 2024. SyncDreamer: Generating Multiview-consistent Images from a Single-view Image. In ICLR."},{"key":"e_1_3_2_1_28_1","unstructured":"Zhen Liu Yao Feng Michael J Black Derek Nowrouzezahrai Liam Paull and Weiyang Liu. 2023. MeshDiffusion: Score-based generative 3d mesh modeling. In ICLR."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Xiaoxiao Long Yuan-Chen Guo Cheng Lin Yuan Liu Zhiyang Dou Lingjie Liu Yuexin Ma Song-Hai Zhang Marc Habermann Christian Theobalt et al. 2024. Wonder3D: Single Image to 3D using Cross-Domain Diffusion. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00951"},{"key":"e_1_3_2_1_30_1","volume-title":"Sparseneus: Fast generalizable neural surface reconstruction from sparse views. In ECCV.","author":"Long Xiaoxiao","year":"2022","unstructured":"Xiaoxiao Long, Cheng Lin, Peng Wang, Taku Komura, and Wenping Wang. 2022. Sparseneus: Fast generalizable neural surface reconstruction from sparse views. In ECCV."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Luke Melas-Kyriazi Iro Laina Christian Rupprecht and Andrea Vedaldi. 2023. Realfusion: 360deg reconstruction of any object from a single image. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00816"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul P Srinivasan Matthew Tancik Jonathan T Barron Ravi Ramamoorthi and Ren Ng. 2020. NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_1_33_1","volume-title":"Instant Neural Graphics Primitives with a Multiresolution Hash Encoding. TOG","author":"M\u00fcller Thomas","year":"2022","unstructured":"Thomas M\u00fcller, Alex Evans, Christoph Schied, and Alexander Keller. 2022. Instant Neural Graphics Primitives with a Multiresolution Hash Encoding. TOG (2022)."},{"key":"e_1_3_2_1_34_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In PMLR.","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In PMLR."},{"key":"e_1_3_2_1_35_1","volume-title":"Point-e: A system for generating 3d point clouds from complex prompts. arXiv preprint arXiv:2212.08751","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Heewoo Jun, Prafulla Dhariwal, Pamela Mishkin, and Mark Chen. 2022. Point-e: A system for generating 3d point clouds from complex prompts. arXiv preprint arXiv:2212.08751 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Michael Niemeyer Jonathan T. Barron Ben Mildenhall Mehdi S. M. Sajjadi Andreas Geiger and Noha Radwan. 2022. RegNeRF: Regularizing Neural Radiance Fields for View Synthesis from Sparse Inputs. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00540"},{"key":"e_1_3_2_1_37_1","unstructured":"Yingwei Pan Zhaofan Qiu Ting Yao Houqiang Li and Tao Mei. 2017. To create what you tell: Generating videos from captions. In ACM Multimedia."},{"key":"e_1_3_2_1_38_1","volume-title":"Dreamfusion: Text-to-3d using 2d diffusion. In ICLR.","author":"Poole Ben","year":"2023","unstructured":"Ben Poole, Ajay Jain, Jonathan T Barron, and Ben Mildenhall. 2023. Dreamfusion: Text-to-3d using 2d diffusion. In ICLR."},{"key":"e_1_3_2_1_39_1","unstructured":"Tianhao Qi Shancheng Fang Yanze Wu Hongtao Xie Jiawei Liu Lang Chen Qian He and Yongdong Zhang. 2024. DEADiff: An Efficient Stylization Diffusion Model with Disentangled Representations. In CVPR."},{"key":"e_1_3_2_1_40_1","unstructured":"Guocheng Qian Jinjie Mai Abdullah Hamdi Jian Ren Aliaksandr Siarohin Bing Li Hsin-Ying Lee Ivan Skorokhodov Peter Wonka Sergey Tulyakov et al. 2024. Magic123: One Image to High-Quality 3D Object Generation Using Both 2D and 3D Diffusion Priors. In ICLR."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Yurui Qian Qi Cai Yingwei Pan Yehao Li Ting Yao Qibin Sun and Tao Mei. 2024. Boosting Diffusion Models with Moving Average Sampling in Frequency Domain. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00851"},{"key":"e_1_3_2_1_42_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Amit Raj Srinivas Kaza Ben Poole Michael Niemeyer Nataniel Ruiz Ben Mildenhall Shiran Zada Kfir Aberman Michael Rubinstein Jonathan Barron et al. 2023. Dreambooth3d: Subject-driven text-to-3d generation. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00223"},{"key":"e_1_3_2_1_44_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. TPAMI","author":"Ranftl Ren\u00e9","year":"2020","unstructured":"Ren\u00e9 Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, and Vladlen Koltun. 2020. Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. TPAMI (2020)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_47_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. In NeurIPS."},{"key":"e_1_3_2_1_48_1","volume-title":"Zero123: a Single Image to Consistent Multi-view Diffusion Base Model. arXiv preprint arXiv:2310.15110","author":"Shi Ruoxi","year":"2023","unstructured":"Ruoxi Shi, Hansheng Chen, Zhuoyang Zhang, Minghua Liu, Chao Xu, Xinyue Wei, Linghao Chen, Chong Zeng, and Hao Su. 2023. Zero123: a Single Image to Consistent Multi-view Diffusion Base Model. arXiv preprint arXiv:2310.15110 (2023)."},{"key":"e_1_3_2_1_49_1","unstructured":"Yichun Shi Peng Wang Jianglong Ye Long Mai Kejie Li and Xiao Yang. 2024. MVDream: Multi-view Diffusion for 3D Generation. In ICLR."},{"key":"e_1_3_2_1_50_1","volume-title":"Visual Text Meets Low-level Vision: A Comprehensive Survey on Visual Text Processing. arXiv preprint arXiv:2402.03082","author":"Shu Yan","year":"2024","unstructured":"Yan Shu, Weichao Zeng, Zhenhang Li, Fangmin Zhao, and Yu Zhou. 2024. Visual Text Meets Low-level Vision: A Comprehensive Survey on Visual Text Processing. arXiv preprint arXiv:2402.03082 (2024)."},{"key":"e_1_3_2_1_51_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2021. Denoising diffusion implicit models. In ICLR."},{"key":"e_1_3_2_1_52_1","unstructured":"StabilityAI. 2023. Stable Zero123. https:\/\/stability.ai\/news\/stable-zero123--3d-generation."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Stanislaw Szymanowicz Christian Rupprecht and Andrea Vedaldi. 2023. Viewset Diffusion:(0-) Image-Conditioned 3D Generative Models from 2D Data. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00814"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Junshu Tang Tengfei Wang Bo Zhang Ting Zhang Ran Yi Lizhuang Ma and Dong Chen. 2023. Make-it-3d: High-fidelity 3d creation from a single image with diffusion prior. In ICCV.","DOI":"10.1109\/ICCV51070.2023.02086"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Maxim Tatarchenko Stephan R Richter Ren\u00e9 Ranftl Zhuwen Li Vladlen Koltun and Thomas Brox. 2019. What do single-view 3d reconstruction networks learn?. In CVPR.","DOI":"10.1109\/CVPR.2019.00352"},{"key":"e_1_3_2_1_56_1","volume-title":"Sv3d: Novel multi-view synthesis and 3d generation from a single image using latent video diffusion. arXiv preprint arXiv:2403.12008","author":"Voleti Vikram","year":"2024","unstructured":"Vikram Voleti, Chun-Han Yao, Mark Boss, Adam Letts, David Pankratz, Dmitry Tochilkin, Christian Laforte, Robin Rombach, and Varun Jampani. 2024. Sv3d: Novel multi-view synthesis and 3d generation from a single image using latent video diffusion. arXiv preprint arXiv:2403.12008 (2024)."},{"key":"e_1_3_2_1_57_1","unstructured":"Peng Wang Lingjie Liu Yuan Liu Christian Theobalt Taku Komura and Wenping Wang. 2021. NeuS: Learning Neural Implicit Surfaces by Volume Rendering for Multi-view Reconstruction. In NeurIPS."},{"key":"e_1_3_2_1_58_1","volume-title":"Real-esrgan: Training real-world blind super-resolution with pure synthetic data. In ICCVW.","author":"Wang Xintao","year":"2021","unstructured":"Xintao Wang, Liangbin Xie, Chao Dong, and Ying Shan. 2021. Real-esrgan: Training real-world blind super-resolution with pure synthetic data. In ICCVW."},{"key":"e_1_3_2_1_59_1","volume-title":"Image quality assessment: from error visibility to structural similarity. TIP","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. TIP (2004)."},{"key":"e_1_3_2_1_60_1","volume-title":"A survey on video diffusion models. arXiv preprint arXiv:2310.10647","author":"Xing Zhen","year":"2023","unstructured":"Zhen Xing, Qijun Feng, Haoran Chen, Qi Dai, Han Hu, Hang Xu, Zuxuan Wu, and Yu-Gang Jiang. 2023. A survey on video diffusion models. arXiv preprint arXiv:2310.10647 (2023)."},{"key":"e_1_3_2_1_61_1","unstructured":"Dejia Xu Yifan Jiang Peihao Wang Zhiwen Fan Yi Wang and Zhangyang Wang. 2023. Neurallift-360: Lifting an in-the-wild 2d photo to a 3d object with 360 views. In CVPR."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Haibo Yang Yang Chen Yingwei Pan Ting Yao Zhineng Chen and Tao Mei. 2023. 3dstyle-diffusion: Pursuing fine-grained text-driven 3d stylization with 2d diffusion models. In ACM MM.","DOI":"10.1145\/3581783.3612363"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Haibo Yang Yang Chen Yingwei Pan Ting Yao Zhineng Chen Zuxuan Wu Yu-gang Jiang and Tao Mei. 2024. DreamMesh: Jointly manipulating and texturing triangle meshes for text-to-3d generation. In ECCV.","DOI":"10.1007\/978-3-031-73202-7_10"},{"key":"e_1_3_2_1_64_1","volume-title":"Differentiable Surface Splatting for Point-based Geometry Processing. TOG","author":"Yifan Wang","year":"2019","unstructured":"Wang Yifan, Felice Serena, Shihao Wu, Cengiz \u00d6ztireli, and Olga Sorkine-Hornung. 2019. Differentiable Surface Splatting for Point-based Geometry Processing. TOG (2019)."},{"key":"e_1_3_2_1_65_1","volume-title":"LION: Latent Point Diffusion Models for 3D Shape Generation. In NeurIPS.","author":"Zeng Xiaohui","year":"2022","unstructured":"Xiaohui Zeng, Arash Vahdat, Francis Williams, Zan Gojcic, Or Litany, Sanja Fidler, and Karsten Kreis. 2022. LION: Latent Point Diffusion Models for 3D Shape Generation. In NeurIPS."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Biao Zhang Jiapeng Tang Matthias Niessner and Peter Wonka. 2023. 3dshape2vecset: A 3d shape representation for neural fields and generative diffusion models. In SIGGRAPH.","DOI":"10.1145\/3592442"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Richard Zhang Phillip Isola Alexei A Efros Eli Shechtman and Oliver Wang. 2018. The Unreasonable Effectiveness of Deep Features as a Perceptual Metric. In CVPR.","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_68_1","volume-title":"TRIP: Temporal Residual Learning with Image Noise Prior for Image-to-Video Diffusion Models. In CVPR.","author":"Zhang Zhongwei","year":"2024","unstructured":"Zhongwei Zhang, Fuchen Long, Yingwei Pan, Zhaofan Qiu, Ting Yao, Yang Cao, and Tao Mei. 2024. TRIP: Temporal Residual Learning with Image Noise Prior for Image-to-Video Diffusion Models. In CVPR."},{"key":"e_1_3_2_1_69_1","volume-title":"Sd-dit: Unleashing the power of self-supervised discrimination in diffusion transformer. In CVPR.","author":"Zhu Rui","year":"2024","unstructured":"Rui Zhu, Yingwei Pan, Yehao Li, Ting Yao, Zhenglong Sun, Tao Mei, and Chang Wen Chen. 2024. Sd-dit: Unleashing the power of self-supervised discrimination in diffusion transformer. In CVPR."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681634","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681634","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681634"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":69,"alternative-id":["10.1145\/3664647.3681634","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681634","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}