{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:19:19Z","timestamp":1765340359690,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"name":"Beijing Natural Science Foundation","award":["L252018"],"award-info":[{"award-number":["L252018"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402054, 61772061"],"award-info":[{"award-number":["62402054, 61772061"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"China Postdoctoral Science Foundation","award":["2024M760279,BX20250390"],"award-info":[{"award-number":["2024M760279,BX20250390"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755409","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"4378-4387","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OGDepth: Leveraging Object Guidance in Diffusion Models for Enhanced Monocular Depth Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-7385-2894","authenticated-orcid":false,"given":"Wenzheng","family":"Yang","sequence":"first","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7926-5727","authenticated-orcid":false,"given":"Songwei","family":"Pei","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5596-3636","authenticated-orcid":false,"given":"Bingfeng","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1612-4644","authenticated-orcid":false,"given":"Qian","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7245-1298","authenticated-orcid":false,"given":"Shangguang","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897187"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00581"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00581"},{"key":"e_1_3_2_1_4_1","volume-title":"ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth. arXiv","author":"Bhat Shariq Farooq","year":"2023","unstructured":"Shariq Farooq Bhat, Reiner Birkl, Diana Wofk, Peter Wonka, and Matthias Muller. 2023. ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth. arXiv, Vol. 2302.12288 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257205739"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00273"},{"volume-title":"Proceedings of the 10th European Conference Computer Vision, Ale\u0161 Leonardis, Elisa Ricci, Stefan Roth, Olga Russakovsky, Torsten Sattler, and G\u00fcl Varol (Eds.)","author":"Duan Yiquan","key":"e_1_3_2_1_6_1","unstructured":"Yiquan Duan, Xianda Guo, and Zheng Zhu. 2025. DiffusionDepth: Diffusion Denoising Approach for\u00a0Monocular Depth Estimation. In Proceedings of the 10th European Conference Computer Vision, Ale\u0161 Leonardis, Elisa Ricci, Stefan Roth, Olga Russakovsky, Torsten Sattler, and G\u00fcl Varol (Eds.). Springer Nature Switzerland, Cham, 432-449."},{"key":"e_1_3_2_1_7_1","volume-title":"Depth map prediction from a single image using a multi-scale deep network. Advances in neural information processing systems","author":"Eigen David","year":"2014","unstructured":"David Eigen, Christian Puhrsch, and Rob Fergus. 2014. Depth map prediction from a single image using a multi-scale deep network. Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00400"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00214"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_45"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"e_1_3_2_1_12_1","unstructured":"Jocher Glenn Chaurasia Ayush and Qiu Jing. 2023. Ultralytics YOLOv8. https:\/\/github.com\/ultralytics\/ultralytics"},{"key":"e_1_3_2_1_13_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01987"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01317"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121352"},{"key":"e_1_3_2_1_20_1","volume-title":"Matthias M\u00fcller, and Peter Wonka.","author":"Lavreniuk Mykola","year":"2023","unstructured":"Mykola Lavreniuk, Shariq Farooq Bhat, Matthias M\u00fcller, and Peter Wonka. 2023. EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature Refinement and Regularized Image-Text Alignment. arXiv (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Dong Wook Ko, and Il Hong Suh","author":"Lee Jin Han","year":"2019","unstructured":"Jin Han Lee, Myung-Kyu Han, Dong Wook Ko, and Il Hong Suh. 2019. From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation. arXiv, Vol. 1907.10326 (2019). https:\/\/api.semanticscholar.org\/CorpusID:198229801"},{"key":"e_1_3_2_1_22_1","unstructured":"Chuyi Li Lulu Li Yifei Geng Hongliang Jiang Meng Cheng Bo Zhang Zaidan Ke Xiaoming Xu and Xiangxiang Chu. [n.d.]. Yolov6 v3. 0: A full-scale reloading. arXiv ( [n. d.])."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML'23). JMLR.org, Article 814, 13 pages."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3416065"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.14711\/thesis-991013270957303412"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01822"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02672"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02057"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In Proceedings of the 38th International Conference on Machine Learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_32_1","volume-title":"Monocular Depth Estimation using Diffusion Models. arXiv","author":"Saurabh Saxena","year":"2023","unstructured":"Saxena Saurabh, Kar Abhishek, Norouzi Mohammad, and J. Fleet David. 2023. Monocular Depth Estimation using Diffusion Models. arXiv, Vol. 2302.14816 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257232830"},{"key":"e_1_3_2_1_33_1","volume-title":"Learning depth from single monocular images. Advances in neural information processing systems","author":"Saxena Ashutosh","year":"2005","unstructured":"Ashutosh Saxena, Sung Chung, and Andrew Ng. 2005. Learning depth from single monocular images. Advances in neural information processing systems, Vol. 18 (2005)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4408828"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2008.132"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00729"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"e_1_3_2_1_38_1","first-page":"2256","volume-title":"Proceedings of the 32nd International Conference on International Conference on Machine Learning","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric A. Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In Proceedings of the 32nd International Conference on International Conference on Machine Learning (Lille, France) (ICML'15). JMLR.org, 2256-2265."},{"key":"e_1_3_2_1_39_1","volume-title":"Denoising diffusion implicit models. arXiv","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv (2020)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01168"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3209968"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00389"},{"key":"e_1_3_2_1_43_1","volume-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv","author":"Zhang Hao","year":"2022","unstructured":"Hao Zhang, Feng Li, Shilong Liu, Lei Zhang, Hang Su, Jun Zhu, Lionel M Ni, and Heung-Yeung Shum. 2022. Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv (2022)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"e_1_3_2_1_45_1","first-page":"487","volume-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems","author":"Zhou Bolei","year":"2014","unstructured":"Bolei Zhou, Agata Lapedriza, Jianxiong Xiao, Antonio Torralba, and Aude Oliva. 2014. Learning deep features for scene recognition using places database. In Proceedings of the 28th International Conference on Neural Information Processing Systems (Montreal, Canada) (NIPS'14). MIT Press, Cambridge, MA, USA, 487-495."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755409","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:14:37Z","timestamp":1765340077000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755409"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755409","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755409","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}