{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T20:49:08Z","timestamp":1757623748904,"version":"3.44.0"},"publisher-location":"Cham","reference-count":29,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032014856"},{"type":"electronic","value":"9783032014863"}],"license":[{"start":{"date-parts":[[2025,8,13]],"date-time":"2025-08-13T00:00:00Z","timestamp":1755043200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,13]],"date-time":"2025-08-13T00:00:00Z","timestamp":1755043200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-01486-3_5","type":"book-chapter","created":{"date-parts":[[2025,8,19]],"date-time":"2025-08-19T15:47:04Z","timestamp":1755618424000},"page":"41-53","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Leveraging Stable Diffusion for\u00a0Monocular Depth Estimation via\u00a0Image Semantic Encoding"],"prefix":"10.1007","author":[{"given":"Jingming","family":"Xia","sequence":"first","affiliation":[]},{"given":"Guanqun","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Guang","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yiben","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Qinzhao","family":"Li","sequence":"additional","affiliation":[]},{"given":"John","family":"Oyekan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,13]]},"reference":[{"issue":"5","key":"5_CR1","doi-asserted-by":"publisher","first-page":"1318","DOI":"10.1109\/TCYB.2013.2265378","volume":"43","author":"J Han","year":"2013","unstructured":"Han, J., Shao, L., Xu, D., Shotton, J.: Enhanced computer vision with microsoft kinect sensor: a review. IEEE Trans. Cybern. 43(5), 1318\u20131334 (2013)","journal-title":"IEEE Trans. Cybern."},{"key":"5_CR2","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2020.12.089","volume":"438","author":"Y Ming","year":"2021","unstructured":"Ming, Y., Meng, X., Fan, C., Yu, H.: Deep learning for monocular depth estimation: a Review. Neurocomputing 438, 14\u201333 (2021)","journal-title":"Neurocomputing"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Kumar, C.S., Bhandarkar, S.M., Prasad, M.: Monocular depth prediction using generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 300\u2013308 (2018)","DOI":"10.1109\/CVPRW.2018.00068"},{"key":"5_CR4","unstructured":"Eigen, D., Puhrsch, C., Fergus, R.: Depth map prediction from a single image using a multi-scale deep network. In: Proceedings of the 27th International Conference on Neural Information Processing Systems (NIPS), vol. 2, pp. 2366\u20132374 (2014)"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Jung, H., Kim, Y., Oh, C., Sohn, K.: Depth prediction from a single image with conditional adversarial networks. In: Proceedings of the IEEE International Conference on Image Processing (ICIP), Beijing, pp. 1717\u20131721 (2017)","DOI":"10.1109\/ICIP.2017.8296575"},{"key":"5_CR6","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Proceedings of the 35th International Conference on Neural Information Processing Systems (NIPS), vol. 1, Article No. 672, pp. 8780\u20138794 (2024)"},{"key":"5_CR7","unstructured":"Saxena, S., Kar, A., Norouzi, M., Fleet, D.J.: Monocular depth estimation using diffusion models. arXiv preprint arXiv:2302.14816 (2023)"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Patni, S., Agarwal, A., Arora, C.: ECoDepth: effective conditioning of diffusion models for monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 28285\u201328295 (2024)","DOI":"10.1109\/CVPR52733.2024.02672"},{"key":"5_CR9","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Proceedings of the 34th International Conference on Neural Information Processing Systems (NIPS), vol. 1, Article No. 574, pp. 6840\u20136851 (2020)"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-Resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310694 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Zhao, W., Yongming, R., Liu, Z., Liu, B., Zhou, J., Lu, J.: Unleashing text-to-image diffusion models for visual perception. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5729\u20135739 (2023)","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"5_CR12","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Xu, X., Guo, J., Wang, Z., Huang, G., Essa, I., Shi, H.: Prompt-free diffusion: taking \u201ctext\u201d out of text-to-image diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8682\u20138692 (2024)","DOI":"10.1109\/CVPR52733.2024.00829"},{"issue":"11","key":"5_CR14","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: the KITTI dataset. Int. J. Robot. Res. 32(11), 1231\u20131237 (2013)","journal-title":"Int. J. Robot. Res."},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Sun, P., et al.: Scalability in perception for autonomous driving: waymo open dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2446\u20132454 (2020)","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac Aodha, O., Firman, M., Brostow, G.J.: Digging into self-supervised monocular depth prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), p. 2019 (2019)","DOI":"10.1109\/ICCV.2019.00393"},{"key":"5_CR17","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: Advances in Neural Information Processing Systems 27 (NIPS), pp. 2672\u20132680 (2014)"},{"key":"5_CR18","unstructured":"Karras, T., Aittala, M., Laine, S., H\u00e4rk\u00f6nen, E., Hellsten, J., Lehtinen, J., Aila, T.: Alias-free generative adversarial networks. In: Proceedings of the 35th International Conference on Neural Information Processing Systems (NeurIPS) (2021)"},{"key":"5_CR19","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the 40th International Conference on Machine Learning (ICML), vol. 2023, Article No. 814, pp. 19730\u201319742 (2023)"},{"key":"5_CR20","unstructured":"A. Dosovitskiy et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2021)"},{"key":"5_CR21","unstructured":"C. Schuhmann et al.: Laion-400m: open dataset of CLIP-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"5_CR22","unstructured":"Byeon, M., Park, B., Kim, H., Lee, S., Baek, W., Kim, S.: Coyo-700 m: image-text pair dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset, Accessed 26 Aug 2024 (2022)"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9992\u201310002 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"5_CR24","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NIPS), pp. 6000\u20136010 (2017)"},{"key":"5_CR25","unstructured":"Paszke, A., et al.: PyTorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems (NeurIPS) (2019)"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Garg, R., Kumar, B.G.V., Carneiro, G., Reid, I.: Unsupervised CNN for single view depth estimation: geometry to the rescue. In: European Conference on Computer Vision (ECCV), pp. 740\u2013756 (2016)","DOI":"10.1007\/978-3-319-46484-8_45"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Z. Tu, X. Chen, P. Ren, and Y. Wang: AdaBin: Improving Binary Neural Networks with Adaptive Binary Sets. In: Proceedings of European Conference on Computer Vision (ECCV), Cham, vol. 13671, pp. 379\u2013395 (2022)","DOI":"10.1007\/978-3-031-20083-0_23"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12179\u201312188 (2021)","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"5_CR29","unstructured":"Bhat, S.F., Birkl, R., Wofk, D., Wonka, P., M\u00fcller, M.: ZoeDepth: zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)"}],"container-title":["Lecture Notes in Computer Science","Towards Autonomous Robotic Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-01486-3_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T09:05:25Z","timestamp":1757408725000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-01486-3_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,13]]},"ISBN":["9783032014856","9783032014863"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-01486-3_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,13]]},"assertion":[{"value":"13 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TAROS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Annual Conference Towards Autonomous Robotic Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"York","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"taros2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/taros-conference.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}