{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:23:58Z","timestamp":1780392238735,"version":"3.54.1"},"publisher-location":"Cham","reference-count":113,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733369","type":"print"},{"value":"9783031733376","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73337-6_14","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:02:27Z","timestamp":1730329347000},"page":"236-257","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Diffusion Models for\u00a0Monocular Depth Estimation: Overcoming Challenging Conditions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6276-5282","authenticated-orcid":false,"given":"Fabio","family":"Tosi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7734-5064","authenticated-orcid":false,"given":"Pierluigi Zama","family":"Ramirez","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3337-2236","authenticated-orcid":false,"given":"Matteo","family":"Poggi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"14_CR1","unstructured":"Stable diffusion v1.5 model card (2022). https:\/\/huggingface.co\/runwayml\/stable-diffusion-v1-5"},{"key":"14_CR2","unstructured":"Stable diffusion xl - sdxl 1.0 model card (2023). https:\/\/huggingface.co\/stabilityai\/stable-diffusion-xl-base-1.0"},{"key":"14_CR3","unstructured":"Alembics: Disco diffusion (2022). https:\/\/github.com\/alembics\/disco-diffusion"},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Aleotti, F., Tosi, F., Poggi, M., Mattoccia, S.: Generative adversarial networks for unsupervised monocular depth prediction. In: Proceedings of the European Conference on Computer Vision (ECCV) Workshops (2018)","DOI":"10.1007\/978-3-030-11009-3_20"},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Avrahami, O., et al.: Spatext: spatio-textual representation for controllable image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18370\u201318380 (2023)","DOI":"10.1109\/CVPR52729.2023.01762"},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Lischinski, D., Fried, O.: Blended diffusion for text-driven editing of natural images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18208\u201318218 (2022)","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"14_CR7","unstructured":"Bar-Tal, O., Yariv, L., Lipman, Y., Dekel, T.: Multidiffusion: fusing diffusion paths for controlled image generation (2023)"},{"key":"14_CR8","doi-asserted-by":"crossref","unstructured":"Bashkirova, D., Lezama, J., Sohn, K., Saenko, K., Essa, I.: Masksketch: unpaired structure-guided masked image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1879\u20131889 (2023)","DOI":"10.1109\/CVPR52729.2023.00187"},{"key":"14_CR9","unstructured":"Bhat, S.F., Alhashim, I., Wonka, P.: Adabins: depth estimation using adaptive bins. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4009\u20134018 (2021)"},{"key":"14_CR10","unstructured":"Bhat, S.F., Birkl, R., Wofk, D., Wonka, P., M\u00fcller, M.: Zoedepth: zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)"},{"key":"14_CR11","unstructured":"Bian, J., Li, Z., Wang, N., Zhan, H., Shen, C., Cheng, M.M., Reid, I.: Unsupervised scale-consistent depth and ego-motion learning from monocular video. Advances in neural information processing systems 32 (2019)"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: Learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuscenes: a multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Casser, V., Pirk, S., Mahjourian, R., Angelova, A.: Depth prediction without the sensors: leveraging structure for unsupervised learning from monocular videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 8001\u20138008 (2019)","DOI":"10.1609\/aaai.v33i01.33018001"},{"key":"14_CR15","doi-asserted-by":"publisher","first-page":"1583","DOI":"10.1007\/s13042-020-01251-y","volume":"12","author":"Y Chen","year":"2021","unstructured":"Chen, Y., Zhao, H., Hu, Z., Peng, J.: Attention-based context aggregation network for monocular depth estimation. Int. J. Mach. Learn. Cybern. 12, 1583\u20131596 (2021)","journal-title":"Int. J. Mach. Learn. Cybern."},{"key":"14_CR16","doi-asserted-by":"crossref","unstructured":"Choi, H., Lee, H., Kim, S., Kim, S., Kim, S., Sohn, K., Min, D.: Adaptive confidence thresholding for monocular depth estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12808\u201312818 (2021)","DOI":"10.1109\/ICCV48922.2021.01257"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proc. of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"14_CR18","doi-asserted-by":"crossref","unstructured":"Costanzino, A., Zama\u00a0Ramirez, P., Poggi, M., Tosi, F., Mattoccia, S., Di\u00a0Stefano, L.: Learning depth estimation for transparent and mirror surfaces. In: The IEEE International Conference on Computer Vision (2023), iCCV","DOI":"10.1109\/ICCV51070.2023.00848"},{"issue":"1","key":"14_CR19","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1109\/MSP.2017.2765202","volume":"35","author":"A Creswell","year":"2018","unstructured":"Creswell, A., White, T., Dumoulin, V., Arulkumaran, K., Sengupta, B., Bharath, A.A.: Generative adversarial networks: an overview. IEEE Signal Process. Mag. 35(1), 53\u201365 (2018)","journal-title":"IEEE Signal Process. Mag."},{"key":"14_CR20","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR21","doi-asserted-by":"crossref","unstructured":"Eftekhar, A., Sax, A., Malik, J., Zamir, A.: Omnidata: a scalable pipeline for making multi-task mid-level vision datasets from 3d scans. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10786\u201310796 (2021)","DOI":"10.1109\/ICCV48922.2021.01061"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Eigen, D., Fergus, R.: Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2650\u20132658 (2015)","DOI":"10.1109\/ICCV.2015.304"},{"key":"14_CR23","unstructured":"Eigen, D., Puhrsch, C., Fergus, R.: Depth map prediction from a single image using a multi-scale deep network. Advances in neural information processing systems 27 (2014)"},{"key":"14_CR24","doi-asserted-by":"publisher","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. In: European Conference on Computer Vision, pp. 89\u2013106. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19784-0_6","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"14_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-46484-8_45","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Garg","year":"2016","unstructured":"Garg, R., B.G., V.K., Carneiro, G., Reid, I.: Unsupervised CNN for single view depth estimation: geometry to the rescue. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 740\u2013756. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_45"},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Gasperini, S., Koch, P., Dallabetta, V., Navab, N., Busam, B., Tombari, F.: R4dyn: exploring radar for self-supervised monocular depth estimation of dynamic scenes. In: 2021 International Conference on 3D Vision (3DV), pp. 751\u2013760. IEEE (2021)","DOI":"10.1109\/3DV53792.2021.00084"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Gasperini, S., Morbitzer, N., Jung, H., Navab, N., Tombari, F.: Robust monocular depth estimation under challenging conditions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00751"},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? the kitti vision benchmark suite. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"14_CR29","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac Aodha, O., Brostow, G.J.: Unsupervised monocular depth estimation with left-right consistency. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.699"},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac Aodha, O., Firman, M., Brostow, G.J.: Digging into self-supervised monocular depth prediction. In: The International Conference on Computer Vision (ICCV), October 2019","DOI":"10.1109\/ICCV.2019.00393"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Gordon, A., Li, H., Jonschkowski, R., Angelova, A.: Depth from videos in the wild: Unsupervised monocular depth learning from unknown cameras. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8977\u20138986 (2019)","DOI":"10.1109\/ICCV.2019.00907"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Guizilini, V., Ambrus, R., Pillai, S., Raventos, A., Gaidon, A.: 3d packing for self-supervised monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2485\u20132494 (2020)","DOI":"10.1109\/CVPR42600.2020.00256"},{"key":"14_CR33","doi-asserted-by":"crossref","unstructured":"Guizilini, V., Hou, R., Li, J., Ambrus, R., Gaidon, A.: Semantically-guided representation learning for self-supervised monocular depth. arXiv preprint arXiv:2002.12319 (2020)","DOI":"10.1109\/CVPR42600.2020.00256"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Guizilini, V., Vasiljevic, I., Chen, D., Ambru\u015f, R., Gaidon, A.: Towards zero-shot scale-aware monocular depth estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9233\u20139243 (2023)","DOI":"10.1109\/ICCV51070.2023.00847"},{"key":"14_CR35","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Hoiem, D., Efros, A.A., Hebert, M.: Automatic photo pop-up. In: ACM SIGGRAPH 2005 Papers, pp. 577\u2013584 (2005)","DOI":"10.1145\/1186822.1073232"},{"key":"14_CR37","doi-asserted-by":"publisher","unstructured":"Hornauer, J., Belagiannis, V.: Gradient-based uncertainty for monocular depth estimation. In: European Conference on Computer Vision, pp. 613\u2013630. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20044-1_35","DOI":"10.1007\/978-3-031-20044-1_35"},{"key":"14_CR38","unstructured":"Hu, M., et al.: Cocktail: mixing multi-modality control for text-conditional image generation. In: Thirty-seventh Conference on Neural Information Processing Systems (2023)"},{"key":"14_CR39","unstructured":"Huang, L., Chen, D., Liu, Y., Shen, Y., Zhao, D., Zhou, J.: Composer: creative and controllable image synthesis with composable conditions. arXiv preprint arXiv:2302.09778 (2023)"},{"issue":"10","key":"14_CR40","doi-asserted-by":"publisher","first-page":"2702","DOI":"10.1109\/TPAMI.2019.2926463","volume":"42","author":"X Huang","year":"2019","unstructured":"Huang, X., Wang, P., Cheng, X., Zhou, D., Geng, Q., Yang, R.: The apolloscape open dataset for autonomous driving and its application. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2702\u20132719 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR41","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"14_CR42","doi-asserted-by":"crossref","unstructured":"Ke, B., Obukhov, A., Huang, S., Metzger, N., Daudt, R.C., Schindler, K.: Repurposing diffusion-based image generators for monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9492\u20139502 (2024)","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"14_CR43","doi-asserted-by":"crossref","unstructured":"Kim, G., Kwon, T., Ye, J.C.: Diffusionclip: text-guided diffusion models for robust image manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2426\u20132435 (2022)","DOI":"10.1109\/CVPR52688.2022.00246"},{"key":"14_CR44","first-page":"21696","volume":"34","author":"D Kingma","year":"2021","unstructured":"Kingma, D., Salimans, T., Poole, B., Ho, J.: Variational diffusion models. Adv. Neural. Inf. Process. Syst. 34, 21696\u201321707 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR45","doi-asserted-by":"crossref","unstructured":"Klingner, M., Term\u00f6hlen, J.A., Mikolajczyk, J., Fingscheidt, T.: Self-supervised monocular depth estimation: Solving the dynamic object problem by semantic guidance. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XX 16, pp. 582\u2013600. Springer (2020)","DOI":"10.1007\/978-3-030-58565-5_35"},{"key":"14_CR46","unstructured":"Lee, J.H., Han, M.K., Ko, D.W., Suh, I.H.: From big to small: multi-scale local planar guidance for monocular depth estimation. arXiv preprint arXiv:1907.10326 (2019)"},{"key":"14_CR47","doi-asserted-by":"crossref","unstructured":"Li, B., Shen, C., Dai, Y., Van Den\u00a0Hengel, A., He, M.: Depth and surface normal estimation from monocular images using regression on deep features and hierarchical crfs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1119\u20131127 (2015)","DOI":"10.1109\/CVPR.2015.7298715"},{"issue":"10","key":"14_CR48","doi-asserted-by":"publisher","first-page":"2024","DOI":"10.1109\/TPAMI.2015.2505283","volume":"38","author":"F Liu","year":"2015","unstructured":"Liu, F., Shen, C., Lin, G., Reid, I.: Learning depth from single monocular images using deep convolutional neural fields. IEEE Trans. Pattern Anal. Mach. Intell. 38(10), 2024\u20132039 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR49","doi-asserted-by":"crossref","unstructured":"Liu, L., Song, X., Wang, M., Liu, Y., Zhang, L.: Self-supervised monocular depth estimation for all day images using domain separation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12737\u201312746 (2021)","DOI":"10.1109\/ICCV48922.2021.01250"},{"key":"14_CR50","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"14_CR51","doi-asserted-by":"crossref","unstructured":"Luo, Y., et al.: Single view stereo matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 155\u2013163 (2018)","DOI":"10.1109\/CVPR.2018.00024"},{"issue":"1","key":"14_CR52","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1177\/0278364916679498","volume":"36","author":"W Maddern","year":"2017","unstructured":"Maddern, W., Pascoe, G., Linegar, C., Newman, P.: 1 year, 1000 km: the Oxford robotcar dataset. Int. J. Robot. Res. 36(1), 3\u201315 (2017)","journal-title":"Int. J. Robot. Res."},{"key":"14_CR53","doi-asserted-by":"crossref","unstructured":"Mahjourian, R., Wicke, M., Angelova, A.: Unsupervised learning of depth and ego-motion from monocular video using 3d geometric constraints. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5667\u20135675 (2018)","DOI":"10.1109\/CVPR.2018.00594"},{"key":"14_CR54","doi-asserted-by":"crossref","unstructured":"Menze, M., Geiger, A.: Object scene flow for autonomous vehicles. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298925"},{"key":"14_CR55","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2i-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 4296\u20134304 (2024)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"14_CR56","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2i-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"14_CR57","doi-asserted-by":"crossref","unstructured":"Neuhold, G., Ollmann, T., Rota\u00a0Bulo, S., Kontschieder, P.: The mapillary vistas dataset for semantic understanding of street scenes. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4990\u20134999 (2017)","DOI":"10.1109\/ICCV.2017.534"},{"key":"14_CR58","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"14_CR59","unstructured":"OpenAI: Dall-e 2 (2023). https:\/\/openai.com\/product\/dall-e-2"},{"key":"14_CR60","doi-asserted-by":"crossref","unstructured":"Parmar, G., Kumar\u00a0Singh, K., Zhang, R., Li, Y., Lu, J., Zhu, J.Y.: Zero-shot image-to-image translation. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591513"},{"key":"14_CR61","doi-asserted-by":"crossref","unstructured":"Patil, V., Sakaridis, C., Liniger, A., Van\u00a0Gool, L.: P3depth: monocular depth estimation with a piecewise planarity prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1610\u20131621 (2022)","DOI":"10.1109\/CVPR52688.2022.00166"},{"key":"14_CR62","doi-asserted-by":"crossref","unstructured":"Peng, R., Wang, R., Lai, Y., Tang, L., Cai, Y.: Excavating the potential capacity of self-supervised monocular depth estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15560\u201315569 (2021)","DOI":"10.1109\/ICCV48922.2021.01527"},{"key":"14_CR63","doi-asserted-by":"crossref","unstructured":"Pilzer, A., Xu, D., Puscas, M., Ricci, E., Sebe, N.: Unsupervised adversarial depth estimation using cycled generative networks. In: 2018 International Conference on 3D Vision (3DV), pp. 587\u2013595. IEEE (2018)","DOI":"10.1109\/3DV.2018.00073"},{"key":"14_CR64","doi-asserted-by":"crossref","unstructured":"Poggi, M., Aleotti, F., Tosi, F., Mattoccia, S.: On the uncertainty of self-supervised monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3227\u20133237 (2020)","DOI":"10.1109\/CVPR42600.2020.00329"},{"key":"14_CR65","doi-asserted-by":"crossref","unstructured":"Poggi, M., Tosi, F., Mattoccia, S.: Learning monocular depth estimation with unsupervised trinocular assumptions. In: 2018 International Conference on 3d Vision (3DV), pp. 324\u2013333. IEEE (2018)","DOI":"10.1109\/3DV.2018.00045"},{"key":"14_CR66","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1(2), 3 (2022)"},{"key":"14_CR67","doi-asserted-by":"crossref","unstructured":"Ramirez, P.Z., et al.: Booster: a benchmark for depth from images of specular and transparent surfaces. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3323858"},{"key":"14_CR68","unstructured":"Ramirez, P.Z., et al.: Ntire 2024 challenge on hr depth from images of specular and transparent surfaces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops, pp. 6499\u20136512, June 2024"},{"key":"14_CR69","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction. ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"14_CR70","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Lasinger, K., Hafner, D., Schindler, K., Koltun, V.: Towards robust monocular depth estimation: mixing datasets for zero-shot cross-dataset transfer. IEEE Trans. Pattern Anal. Mach. Intell. 44(3) (2022)","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"14_CR71","doi-asserted-by":"crossref","unstructured":"Ranjan, A., et al.: Competitive collaboration: Joint unsupervised learning of depth, camera motion, optical flow and motion segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12240\u201312249 (2019)","DOI":"10.1109\/CVPR.2019.01252"},{"key":"14_CR72","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"14_CR73","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"14_CR74","doi-asserted-by":"crossref","unstructured":"Sajjan, S., et al.: Clear grasp: 3d shape estimation of transparent objects for manipulation. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 3634\u20133642. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9197518"},{"key":"14_CR75","unstructured":"Saxena, A., Chung, S., Ng, A.: Learning depth from single monocular images. Advances in neural information processing systems 18 (2005)"},{"issue":"5","key":"14_CR76","doi-asserted-by":"publisher","first-page":"824","DOI":"10.1109\/TPAMI.2008.132","volume":"31","author":"A Saxena","year":"2008","unstructured":"Saxena, A., Sun, M., Ng, A.Y.: Make3d: learning 3d scene structure from a single still image. IEEE Trans. Pattern Anal. Mach. Intell. 31(5), 824\u2013840 (2008)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR77","unstructured":"Saxena, S., Herrmann, C., Hur, J., Kar, A., Norouzi, M., Sun, D., Fleet, D.J.: The surprising effectiveness of diffusion models for optical flow and monocular depth estimation. arXiv preprint arXiv:2306.01923 (2023)"},{"key":"14_CR78","unstructured":"Saxena, S., Kar, A., Norouzi, M., Fleet, D.J.: Monocular depth estimation using diffusion models. arXiv preprint arXiv:2302.14816 (2023)"},{"key":"14_CR79","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"14_CR80","doi-asserted-by":"crossref","unstructured":"Spencer, J., Bowden, R., Hadfield, S.: Defeat-net: general monocular depth via simultaneous unsupervised representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14402\u201314413 (2020)","DOI":"10.1109\/CVPR42600.2020.01441"},{"key":"14_CR81","doi-asserted-by":"crossref","unstructured":"Spencer, J., et\u00a0al.: The monocular depth estimation challenge. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 623\u2013632 (2023)","DOI":"10.1109\/WACVW58289.2023.00069"},{"key":"14_CR82","unstructured":"Spencer, J., et\u00a0al.: The second monocular depth estimation challenge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3064\u20133076 (2023)"},{"key":"14_CR83","unstructured":"Spencer, J., et\u00a0al.: The third monocular depth estimation challenge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1\u201314 (2024)"},{"issue":"5","key":"14_CR84","doi-asserted-by":"publisher","first-page":"2023","DOI":"10.1109\/TNNLS.2021.3100895","volume":"33","author":"Q Sun","year":"2021","unstructured":"Sun, Q., Tang, Y., Zhang, C., Zhao, C., Qian, F., Kurths, J.: Unsupervised estimation of monocular depth and vo in dynamic environments via hybrid masks. IEEE Trans. Neural Networks Learn. Syst. 33(5), 2023\u20132033 (2021)","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"14_CR85","doi-asserted-by":"crossref","unstructured":"Tosi, F., Aleotti, F., Poggi, M., Mattoccia, S.: Learning monocular depth estimation infusing traditional stereo knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9799\u20139809 (2019)","DOI":"10.1109\/CVPR.2019.01003"},{"key":"14_CR86","doi-asserted-by":"crossref","unstructured":"Tosi, F., et al.: Distilled semantics for comprehensive scene understanding from videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4654\u20134665 (2020)","DOI":"10.1109\/CVPR42600.2020.00471"},{"key":"14_CR87","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1007\/978-3-030-58604-1_27","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Vankadari","year":"2020","unstructured":"Vankadari, M., Garg, S., Majumder, A., Kumar, S., Behera, A.: Unsupervised monocular depth estimation for night-time images using adversarial domain feature adaptation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 443\u2013459. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_27"},{"key":"14_CR88","unstructured":"Vankadari, M., Golodetz, S., Garg, S., Shin, S., Markham, A., Trigoni, N.: When the sun goes down: repairing photometric losses for all-day depth estimation. In: Conference on Robot Learning, pp. 1992\u20132003. PMLR (2023)"},{"key":"14_CR89","doi-asserted-by":"crossref","unstructured":"Voynov, A., Aberman, K., Cohen-Or, D.: Sketch-guided text-to-image diffusion models. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591560"},{"key":"14_CR90","doi-asserted-by":"crossref","unstructured":"Wang, C., Buenaposada, J.M., Zhu, R., Lucey, S.: Learning depth from monocular videos using direct methods. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2022\u20132030 (2018)","DOI":"10.1109\/CVPR.2018.00216"},{"key":"14_CR91","doi-asserted-by":"crossref","unstructured":"Wang, K., et al.: Regularizing nighttime weirdness: efficient self-supervised monocular depth estimation in the dark. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16055\u201316064 (2021)","DOI":"10.1109\/ICCV48922.2021.01575"},{"key":"14_CR92","doi-asserted-by":"crossref","unstructured":"Watson, J., Firman, M., Brostow, G.J., Turmukhambetov, D.: Self-supervised monocular depth hints. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2162\u20132171 (2019)","DOI":"10.1109\/ICCV.2019.00225"},{"key":"14_CR93","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., Wang, J., Hall, M., Neumann, U., Su, S.: Toward practical monocular indoor depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3814\u20133824 (2022)","DOI":"10.1109\/CVPR52688.2022.00379"},{"key":"14_CR94","doi-asserted-by":"crossref","unstructured":"Xie, E., Wang, W., Wang, W., Ding, M., Shen, C., Luo, P.: Segmenting transparent objects in the wild. arXiv preprint arXiv:2003.13948 (2020)","DOI":"10.1007\/978-3-030-58601-0_41"},{"key":"14_CR95","doi-asserted-by":"crossref","unstructured":"Yang, G., Song, X., Huang, C., Deng, Z., Shi, J., Zhou, B.: Drivingstereo: a large-scale dataset for stereo matching in autonomous driving scenarios. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 899\u2013908 (2019)","DOI":"10.1109\/CVPR.2019.00099"},{"key":"14_CR96","doi-asserted-by":"crossref","unstructured":"Yang, L., Kang, B., Huang, Z., Xu, X., Feng, J., Zhao, H.: Depth anything: unleashing the power of large-scale unlabeled data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10371\u201310381 (2024)","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"14_CR97","doi-asserted-by":"crossref","unstructured":"Yang, X., Mei, H., Xu, K., Wei, X., Yin, B., Lau, R.W.: Where is my mirror? In: The IEEE International Conference on Computer Vision (ICCV), October 2019","DOI":"10.1109\/ICCV.2019.00890"},{"key":"14_CR98","doi-asserted-by":"crossref","unstructured":"Yin, W., Liu, Y., Shen, C., Yan, Y.: Enforcing geometric constraints of virtual normal for depth prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5684\u20135693 (2019)","DOI":"10.1109\/ICCV.2019.00578"},{"key":"14_CR99","doi-asserted-by":"crossref","unstructured":"Yin, W., Zhang, C., Chen, H., Cai, Z., Yu, G., Wang, K., Chen, X., Shen, C.: Metric3d: Towards zero-shot metric 3d prediction from a single image. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 9043\u20139053 (2023)","DOI":"10.1109\/ICCV51070.2023.00830"},{"key":"14_CR100","doi-asserted-by":"crossref","unstructured":"Yin, Z., Shi, J.: Geonet: unsupervised learning of dense depth, optical flow and camera pose. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1983\u20131992 (2018)","DOI":"10.1109\/CVPR.2018.00212"},{"key":"14_CR101","doi-asserted-by":"crossref","unstructured":"Yuan, W., Gu, X., Dai, Z., Zhu, S., Tan, P.: Neural window fully-connected crfs for monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3916\u20133925 (2022)","DOI":"10.1109\/CVPR52688.2022.00389"},{"key":"14_CR102","doi-asserted-by":"crossref","unstructured":"Zama\u00a0Ramirez, P., et al.: NTIRE 2023 challenge on HR depth from images of specular and transparent surfaces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2023)","DOI":"10.1109\/CVPRW59228.2023.00143"},{"key":"14_CR103","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"298","DOI":"10.1007\/978-3-030-20893-6_19","volume-title":"Computer Vision \u2013 ACCV 2018","author":"P Zama Ramirez","year":"2019","unstructured":"Zama Ramirez, P., Poggi, M., Tosi, F., Mattoccia, S., Di Stefano, L.: Geometry meets semantics for semi-supervised monocular depth estimation. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11363, pp. 298\u2013313. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20893-6_19"},{"key":"14_CR104","doi-asserted-by":"crossref","unstructured":"Zama\u00a0Ramirez, P., Tosi, F., Poggi, M., Salti, S., Di\u00a0Stefano, L., Mattoccia, S.: Open challenges in deep stereo: the booster dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022), cVPR","DOI":"10.1109\/CVPR52688.2022.02049"},{"key":"14_CR105","unstructured":"Zavadski, D., Feiden, J.F., Rother, C.: Controlnet-xs: designing an efficient and effective architecture for controlling text-to-image diffusion models. arXiv preprint arXiv:2312.06573 (2023)"},{"key":"14_CR106","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"issue":"9","key":"14_CR107","doi-asserted-by":"publisher","first-page":"1612","DOI":"10.1007\/s11431-020-1582-8","volume":"63","author":"C Zhao","year":"2020","unstructured":"Zhao, C., Sun, Q., Zhang, C., Tang, Y., Qian, F.: Monocular depth estimation based on deep learning: an overview. Sci. China Technol. Sci. 63(9), 1612\u20131627 (2020)","journal-title":"Sci. China Technol. Sci."},{"issue":"5","key":"14_CR108","doi-asserted-by":"publisher","first-page":"1237","DOI":"10.1109\/TETCI.2022.3182360","volume":"6","author":"C Zhao","year":"2022","unstructured":"Zhao, C., Tang, Y., Sun, Q.: Unsupervised monocular depth estimation in highly complex environments. IEEE Trans. Emerging Topics Comput. Intell. 6(5), 1237\u20131246 (2022)","journal-title":"IEEE Trans. Emerging Topics Comput. Intell."},{"key":"14_CR109","doi-asserted-by":"crossref","unstructured":"Zhao, C., et al.: Monovit: self-supervised monocular depth estimation with a vision transformer. arXiv preprint arXiv:2208.03543 (2022)","DOI":"10.1109\/3DV57658.2022.00077"},{"key":"14_CR110","unstructured":"Zhao, S., et al.: Uni-controlnet: all-in-one control to text-to-image diffusion models. Advances in Neural Information Processing Systems 36 (2024)"},{"key":"14_CR111","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1007\/978-3-030-58580-8_10","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Zheng","year":"2020","unstructured":"Zheng, Z., Wu, Y., Han, X., Shi, J.: ForkGAN: seeing into the rainy night. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 155\u2013170. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_10"},{"key":"14_CR112","doi-asserted-by":"crossref","unstructured":"Zhou, T., Brown, M., Snavely, N., Lowe, D.G.: Unsupervised learning of depth and ego-motion from video. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1851\u20131858 (2017)","DOI":"10.1109\/CVPR.2017.700"},{"key":"14_CR113","doi-asserted-by":"crossref","unstructured":"Zou, Y., Luo, Z., Huang, J.B.: Df-net: unsupervised joint learning of depth and flow using cross-task consistency. In: Proceedings of the European conference on computer vision (ECCV), pp. 36\u201353 (2018)","DOI":"10.1007\/978-3-030-01228-1_3"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73337-6_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:05:54Z","timestamp":1730329554000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73337-6_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733369","9783031733376"],"references-count":113,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73337-6_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}