{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:55:28Z","timestamp":1781538928442,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Shanghai Municipal Science and Technology Major\/Key Project","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"name":"National Natural Science Foundation of China","award":["62076162"],"award-info":[{"award-number":["62076162"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810569","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1213-1221","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HM-NVS: Hierarchical Multi-Modal Novel View Synthesis\\\\with Uncertainty-Aware Progressive Refinement"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8884-5700","authenticated-orcid":false,"given":"Jiahao","family":"Chang","sequence":"first","affiliation":[{"name":"Shanghai Jiaotong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2414-4362","authenticated-orcid":false,"given":"Haohua","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiaotong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7597-8503","authenticated-orcid":false,"given":"Liqing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiaotong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00539"},{"key":"e_1_3_3_1_3_2","unstructured":"Shariq\u00a0Farooq Bhat Reiner Birkl Diana Wofk Peter Wonka and Matthias Muller. 2023. ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.12288 (2023)."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_3_1_5_2","volume-title":"ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811809"},{"key":"e_1_3_3_1_7_2","volume-title":"ICML","author":"Gal Yarin","year":"2016","unstructured":"Yarin Gal and Zoubin Ghahramani. 2016. Dropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning. In ICML."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Yizeng Han Gao Huang Shiji Song Le Yang Honghui Wang and Yulin Wang. 2021. Dynamic Neural Networks: A Survey. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 11 (2021) 7436\u20137456.","DOI":"10.1109\/TPAMI.2021.3117837"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.5555\/861369"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_11_2","volume-title":"NeurIPS","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In NeurIPS."},{"key":"e_1_3_3_1_12_2","volume-title":"NeurIPS","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In NeurIPS."},{"key":"e_1_3_3_1_13_2","unstructured":"Heewoo Jun and Alex Nichol. 2023. Shap-E: Generating Conditional 3D Implicit Functions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.02463 (2023)."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_23"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Kevin Karsch Ce Liu and Sing\u00a0Bing Kang. 2014. Depth Transfer: Depth Extraction from Video Using Non-parametric Sampling. IEEE Transactions on Pattern Analysis and Machine Intelligence 36 11 (2014) 2144\u20132158.","DOI":"10.1109\/TPAMI.2014.2316835"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3D Gaussian Splatting for Real-Time Radiance Field Rendering. ACM Transactions on Graphics (SIGGRAPH) 42 4 (2023).","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_1_18_2","volume-title":"NeurIPS","author":"Lakshminarayanan Balaji","year":"2017","unstructured":"Balaji Lakshminarayanan, Alexander Pritzel, and Charles Blundell. 2017. Simple and Scalable Predictive Uncertainty Estimation using Deep Ensembles. In NeurIPS."},{"key":"e_1_3_3_1_19_2","volume-title":"ICML","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In ICML."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00960"},{"key":"e_1_3_3_1_23_2","volume-title":"NeurIPS","author":"Liu Minghua","year":"2023","unstructured":"Minghua Liu, Chao Xu, Haian Jin, Linghao Chen, Mukund Varma\u00a0T, Zexiang Xu, and Hao Su. 2023. One-2-3-45: Any Single Image to 3D Mesh in 45 Seconds without Per-shape Optimization. In NeurIPS."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_3_1_25_2","volume-title":"ECCV","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et\u00a0al. 2024. Grounding DINO: Marrying DINO with Grounded Pre-training for Open-Set Object Detection. In ECCV."},{"key":"e_1_3_3_1_26_2","volume-title":"ICLR","author":"Liu Yuan","year":"2024","unstructured":"Yuan Liu, Cheng Lin, Zijiao Zeng, Xiaoxiao Long, Lingjie Liu, Taku Komura, and Wenping Wang. 2024. SyncDreamer: Generating Multiview-consistent Images from a Single-view Image. In ICLR."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00951"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul\u00a0P. Srinivasan Matthew Tancik Jonathan\u00a0T. Barron Ravi Ramamoorthi and Ren Ng. 2021. NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis. Commun. ACM 65 1 (2021) 99\u2013106.","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_1_29_2","unstructured":"Alex Nichol Heewoo Jun Prafulla Dhariwal Pamela Mishkin and Mark Chen. 2022. Point-E: A System for Generating 3D Point Clouds from Complex Prompts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.08751 (2022)."},{"key":"e_1_3_3_1_30_2","volume-title":"NeurIPS","author":"Ning Qian","year":"2021","unstructured":"Qian Ning, Weisheng Dong, Guangming Shi, Leida Li, and Xin Li. 2021. Uncertainty-driven Loss for Single Image Super-resolution. In NeurIPS."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/3DV62453.2024.00026"},{"key":"e_1_3_3_1_32_2","volume-title":"ICLR","author":"Poole Ben","year":"2023","unstructured":"Ben Poole, Ajay Jain, Jonathan\u00a0T. Barron, and Ben Mildenhall. 2023. DreamFusion: Text-to-3D using 2D Diffusion. In ICLR."},{"key":"e_1_3_3_1_33_2","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models from Natural Language Supervision. In ICML."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_36_2","unstructured":"Ruoxi Shi Hansheng Chen Zhuoyang Zhang Minghua Liu Chao Xu Xinyue Wei Linghao Chen Chong Zeng and Hao Su. 2023. Zero123++: A Single Image to Consistent Multi-view Diffusion Base Model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.15110 (2023)."},{"key":"e_1_3_3_1_37_2","volume-title":"CVPR","author":"Shi Yukai","year":"2024","unstructured":"Yukai Shi, Robin Clark, Sai Bi, Kalyan Sunkavalli, and Hao Su. 2024. TOSS: High-quality Text-guided Novel View Synthesis from a Single Image. In CVPR."},{"key":"e_1_3_3_1_38_2","unstructured":"Yichun Shi Peng Wang Jianglong Ye Mai Long Kejie Li and Xiao Yang. 2023. MVDream: Multi-view Diffusion for 3D Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.16512 (2023)."},{"key":"e_1_3_3_1_39_2","volume-title":"ICLR","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In ICLR."},{"key":"e_1_3_3_1_40_2","volume-title":"ECCV","author":"Tang Jiaxiang","year":"2024","unstructured":"Jiaxiang Tang, Zhaoxi Chen, Xiaokang Chen, Tengfei Wang, Gang Zeng, and Ziwei Liu. 2024. LGM: Large Multi-view Gaussian Model for High-Resolution 3D Content Creation. In ECCV."},{"key":"e_1_3_3_1_41_2","volume-title":"NeurIPS","author":"Tang Shitao","year":"2023","unstructured":"Shitao Tang, Fuyang Zhang, Jiacheng Chen, Peng Wang, and Yasutaka Furukawa. 2023. MVDiffusion: Enabling Holistic Multi-view Image Generation with Correspondence-aware Diffusion. In NeurIPS."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2016.7900006"},{"key":"e_1_3_3_1_43_2","volume-title":"NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In NeurIPS."},{"key":"e_1_3_3_1_44_2","volume-title":"ECCV","author":"Voleti Vikram","year":"2024","unstructured":"Vikram Voleti, Chun-Han Xu, Daniil Turmukhambetov, Federica Bogo, Diogo Luvizon, Richard Tucker, Andrea Vedaldi, and David Novotny. 2024. SV3D: Novel Multi-view Synthesis and 3D Generation from a Single Image using Latent Video Diffusion. In ECCV."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Guotai Wang Wenqi Li Michael Aertsen Jan Deprest S\u00e9bastien Ourselin and Tom Vercauteren. 2019. Aleatoric Uncertainty Estimation with Test-Time Augmentation for Medical Image Segmentation with Convolutional Neural Networks. Neurocomputing 338 (2019) 34\u201345.","DOI":"10.1016\/j.neucom.2019.01.103"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C. Bovik Hamid\u00a0R. Sheikh and Eero\u00a0P. Simoncelli. 2004. Image Quality Assessment: From Error Visibility to Structural Similarity. IEEE Transactions on Image Processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_1_48_2","unstructured":"Haohan Weng Tianyu Yang Jianan Wang Yu Li Tong Zhang C.\u00a0L.\u00a0Philip Chen and Lei Shao. 2023. Consistent123: Improve Consistency for One Image to 3D Object Synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.08092 (2023)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00084"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_47"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:17:11Z","timestamp":1781536631000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810569"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":51,"alternative-id":["10.1145\/3805622.3810569","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810569","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}