{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:24Z","timestamp":1755825024686,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733319","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"542-549","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Monocular Depth Estimation Via Single-Step Latent Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7192-2593","authenticated-orcid":false,"given":"Zhiyong","family":"Huo","sequence":"first","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, Nanjing, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6112-8767","authenticated-orcid":false,"given":"Zhendong","family":"Wang","sequence":"additional","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, Nanjing, Jiangsu, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2972122"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3562939.3565620"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00316"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342058"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342058"},{"key":"e_1_3_2_1_6_1","volume-title":"Depth map prediction from a single image using a multi-scale deep network. Advances in neural information processing systems","author":"Eigen David","year":"2014","unstructured":"David Eigen, Christian Puhrsch, and Rob Fergus. 2014. Depth map prediction from a single image using a multi-scale deep network. Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 4009--4018","author":"Bhat Shariq Farooq","year":"2021","unstructured":"Shariq Farooq Bhat, Ibraheem Alhashim, and Peter Wonka. 2021. Adabins: Depth estimation using adaptive bins. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 4009--4018."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-023-1458-0"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00389"},{"key":"e_1_3_2_1_10_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_14"},{"key":"e_1_3_2_1_14_1","volume-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer","author":"Ranftl Ren\u00e9","year":"2020","unstructured":"Ren\u00e9 Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, and Vladlen Koltun. 2020. Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 3 (2020), 1623--1637."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00027"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"e_1_3_2_1_17_1","first-page":"14128","article-title":"Hierarchical normalization for robust monocular depth estimation","volume":"35","author":"Zhang Chi","year":"2022","unstructured":"Chi Zhang, Wei Yin, Billzb Wang, Gang Yu, Bin Fu, and Chunhua Shen. 2022. Hierarchical normalization for robust monocular depth estimation. Advances in Neural Information Processing Systems, Vol. 35 (2022), 14128--14139.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Monocular depth estimation using diffusion models. arXiv preprint arXiv:2302.14816","author":"Saxena Saurabh","year":"2023","unstructured":"Saurabh Saxena, Abhishek Kar, Mohammad Norouzi, and David J Fleet. 2023. Monocular depth estimation using diffusion models. arXiv preprint arXiv:2302.14816 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"e_1_3_2_1_20_1","volume-title":"Matthias M\u00fcller, and Peter Wonka.","author":"Lavreniuk Mykola","year":"2023","unstructured":"Mykola Lavreniuk, Shariq Farooq Bhat, Matthias M\u00fcller, and Peter Wonka. 2023. EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature Refinement and Regularized Image-Text Alignment. arXiv preprint arXiv:2312.08548 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Diffusion models trained with large data are transferable visual models. arXiv e-prints","author":"Xu Guangkai","year":"2024","unstructured":"Guangkai Xu, Yongtao Ge, Mingyu Liu, Chengxiang Fan, Kangyang Xie, Zhiyue Zhao, Hao Chen, and Chunhua Shen. 2024. Diffusion models trained with large data are transferable visual models. arXiv e-prints (2024), arXiv--2403."},{"key":"e_1_3_2_1_22_1","volume-title":"Diversedepth: Affine-invariant depth prediction using diverse data. arXiv preprint arXiv:2002.00569","author":"Yin Wei","year":"2020","unstructured":"Wei Yin, Xinlong Wang, Chunhua Shen, Yifan Liu, Zhi Tian, Songcen Xu, Changming Sun, and Dou Renyin. 2020. Diversedepth: Affine-invariant depth prediction using diverse data. arXiv preprint arXiv:2002.00569 (2020)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01061"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00830"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01073"},{"key":"e_1_3_2_1_26_1","volume-title":"Virtual kitti 2. arXiv preprint arXiv:2001.10773","author":"Cabon Yohann","year":"2020","unstructured":"Yohann Cabon, Naila Murray, and Martin Humenberger. 2020. Virtual kitti 2. arXiv preprint arXiv:2001.10773 (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings, Part V 12","author":"Silberman Nathan","year":"2012","unstructured":"Nathan Silberman, Derek Hoiem, Pushmeet Kohli, and Rob Fergus. 2012. Indoor segmentation and support inference from rgbd images. In Computer Vision--ECCV 2012: 12th European Conference on Computer Vision, Florence, Italy, October 7--13, 2012, Proceedings, Part V 12. Springer, 746--760."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.272"},{"key":"e_1_3_2_1_31_1","volume-title":"Diode: A dense indoor and outdoor depth dataset. arXiv preprint arXiv:1908.00463","author":"Vasiljevic Igor","year":"2019","unstructured":"Igor Vasiljevic, Nick Kolkin, Shanyi Zhang, Ruotian Luo, Haochen Wang, Falcon Z Dai, Andrea F Daniele, Mohammadreza Mostajabi, Steven Basart, Matthew R Walter, et al. 2019. Diode: A dense indoor and outdoor depth dataset. arXiv preprint arXiv:1908.00463 (2019)."},{"key":"e_1_3_2_1_32_1","volume-title":"Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378","author":"Luo Simian","year":"2023","unstructured":"Simian Luo, Yiqin Tan, Longbo Huang, Jian Li, and Hang Zhao. 2023. Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 (2023)."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733319","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:08:54Z","timestamp":1755749334000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733319"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":32,"alternative-id":["10.1145\/3731715.3733319","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733319","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}