{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T05:24:16Z","timestamp":1778217856839,"version":"3.51.4"},"reference-count":80,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,10,23]],"date-time":"2023-10-23T00:00:00Z","timestamp":1698019200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,10,23]],"date-time":"2023-10-23T00:00:00Z","timestamp":1698019200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271410"],"award-info":[{"award-number":["62271410"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1007\/s11263-023-01915-6","type":"journal-article","created":{"date-parts":[[2023,10,23]],"date-time":"2023-10-23T08:02:10Z","timestamp":1698048130000},"page":"1012-1028","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Towards a Unified Network for Robust Monocular Depth Estimation: Network Architecture, Training Strategy and Dataset"],"prefix":"10.1007","volume":"132","author":[{"given":"Mochu","family":"Xiang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4432-7406","authenticated-orcid":false,"given":"Yuchao","family":"Dai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feiyu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiawei","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyu","family":"Tian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhensong","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,10,23]]},"reference":[{"issue":"19","key":"1915_CR1","doi-asserted-by":"publisher","first-page":"16423","DOI":"10.1007\/s00521-022-07663-x","volume":"34","author":"S Abdulwahab","year":"2022","unstructured":"Abdulwahab, S., Rashwan, H. A., Garcia, M. A., Masoumian, A., & Puig, D. (2022). Monocular depth map estimation based on a multi-scale deep architecture and curvilinear saliency feature boosting. Neural Computing and Applications, 34(19), 16423\u201316440.","journal-title":"Neural Computing and Applications"},{"key":"1915_CR2","unstructured":"Alhashim, I., & Wonka, P. (2018). High quality monocular depth estimation via transfer learning. arXiv preprint arXiv:1812.11941"},{"key":"1915_CR3","doi-asserted-by":"crossref","unstructured":"Atapour-Abarghouei, A., & Breckon, T. P. (2018). Real-time monocular depth estimation using synthetic data with domain adaptation via image style transfer. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 2800\u20132810).","DOI":"10.1109\/CVPR.2018.00296"},{"key":"1915_CR4","unstructured":"Bhat, S. F., Alhashim, I., & Wonka, P. (2021). Adabins: Depth estimation using adaptive bins. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4009\u20134018)."},{"key":"1915_CR5","doi-asserted-by":"crossref","unstructured":"Butler, D. J., Wulff, J., Stanley, G. B., & Black, M. J. (2012). A naturalistic open source movie for optical flow evaluation. In European conference on computer vision (ECCV) (pp. 611\u2013625).","DOI":"10.1007\/978-3-642-33783-3_44"},{"key":"1915_CR6","unstructured":"Cabon, Y., Murray, N., & Humenberger, M. (2020). Virtual KITTI 2. arXiv preprint arXiv:2001.10773"},{"key":"1915_CR7","unstructured":"Chen, W., Fu, Z., Yang, D., & Deng, J. (2016). Single-image depth perception in the wild. In Advances in neural information processing systems (NeurIPS) (vol.\u00a029)."},{"key":"1915_CR8","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., & Schiele, B. (2016). The cityscapes dataset for semantic urban scene understanding. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 3213\u20133223).","DOI":"10.1109\/CVPR.2016.350"},{"key":"1915_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). ImageNet: A large-scale hierarchical image database. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 248\u2013255).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1915_CR10","doi-asserted-by":"crossref","unstructured":"Dong, X., Bao, J., Chen, D., Zhang, W., Yu, N., Yuan, L., Chen, D., & Guo, B. (2022). CSWin transformer: A general vision transformer backbone with cross-shaped windows. In IEEE conference on computer vision and pattern recognition (CVPR) (pp 12124\u201312134).","DOI":"10.1109\/CVPR52688.2022.01181"},{"key":"1915_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In International conference on learning representations (ICLR)."},{"key":"1915_CR12","unstructured":"Eigen, D., Puhrsch, C., & Fergus, R. (2014). Depth map prediction from a single image using a multi-scale deep network. In Advances in neural information processing systems (NeurIPS) (vol.\u00a027)."},{"key":"1915_CR13","doi-asserted-by":"crossref","unstructured":"Facil, J. M., Ummenhofer, B., Zhou, H., Montesano, L., Brox, T., & Civera, J. (2019). CAM-convs: Camera-aware multi-scale convolutions for single-view depth. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 11826\u201311835).","DOI":"10.1109\/CVPR.2019.01210"},{"key":"1915_CR14","doi-asserted-by":"crossref","unstructured":"Fu, H., Gong, M., Wang, C., Batmanghelich, K., & Tao, D. (2018). Deep ordinal regression network for monocular depth estimation. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 2002\u20132011).","DOI":"10.1109\/CVPR.2018.00214"},{"key":"1915_CR15","doi-asserted-by":"crossref","unstructured":"Gaidon, A., Wang, Q., Cabon, Y., & Vig, E. (2016). Virtual worlds as proxy for multi-object tracking analysis. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4340\u20134349).","DOI":"10.1109\/CVPR.2016.470"},{"issue":"11","key":"1915_CR16","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., & Urtasun, R. (2013). Vision meets robotics: The KITTI dataset. The International Journal of Robotics Research, 32(11), 1231\u20131237.","journal-title":"The International Journal of Robotics Research"},{"key":"1915_CR17","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac\u00a0Aodha, O., & Brostow, G. J. (2017). Unsupervised monocular depth estimation with left-right consistency. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 270\u2013279).","DOI":"10.1109\/CVPR.2017.699"},{"key":"1915_CR18","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac\u00a0Aodha, O., Firman, M., & Brostow, G. J. (2019). Digging into self-supervised monocular depth estimation. In IEEE international conference on computer vision (ICCV) (pp. 3828\u20133838).","DOI":"10.1109\/ICCV.2019.00393"},{"key":"1915_CR19","unstructured":"Gta5-depth-estimation, Retrieved July 26, 2022. https:\/\/github.com\/gta5-vision\/GTA5-depth-estimation"},{"key":"1915_CR20","unstructured":"Han, K., Wang, Y., Guo, J., Tang, Y., & Wu, E. (2022). Vision GNN: An image is worth graph of nodes. arXiv preprint arXiv:2206.00272"},{"key":"1915_CR21","doi-asserted-by":"crossref","unstructured":"He, M., Hui, L., Bian, Y., Ren, J., Xie, J., & Yang, J. (2022). RA-depth: Resolution adaptive self-supervised monocular depth estimation. In European conference on computer vision (ECCV) (pp. 565\u2013581).","DOI":"10.1007\/978-3-031-19812-0_33"},{"key":"1915_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"1915_CR23","unstructured":"Hua, Y., Kohli, P., Uplavikar, P., Ravi, A., Gunaseelan, S., Orozco, J., & Li, E. (2020). Holopix50k: A large-scale in-the-wild stereo image dataset. arXiv preprint arXiv:2003.11172"},{"key":"1915_CR24","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Van Der\u00a0Maaten, L., & Weinberger, K. Q. (2017). Densely connected convolutional networks. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4700\u20134708).","DOI":"10.1109\/CVPR.2017.243"},{"key":"1915_CR25","doi-asserted-by":"crossref","unstructured":"Hurl, B., Czarnecki, K., & Waslander, S. (2019). Precise synthetic image and LiDAR (PreSIL) dataset for autonomous vehicle perception. In IEEE intelligent vehicles symposium (IV) (pp. 2522\u20132529).","DOI":"10.1109\/IVS.2019.8813809"},{"key":"1915_CR26","doi-asserted-by":"crossref","unstructured":"Ji, P., Li, R., Bhanu, B., & Xu, Y. (2021). MonoIndoor: Towards good practice of self-supervised monocular depth estimation for indoor environments. In IEEE international conference on computer vision (ICCV) (pp. 12787\u201312796).","DOI":"10.1109\/ICCV48922.2021.01255"},{"issue":"11","key":"1915_CR27","doi-asserted-by":"publisher","first-page":"5227","DOI":"10.1109\/TIP.2016.2601262","volume":"25","author":"Y Kim","year":"2016","unstructured":"Kim, Y., Ham, B., Oh, C., & Sohn, K. (2016). Structure selective depth superresolution for RGB-D cameras. IEEE Transactions on Image Processing (TIP), 25(11), 5227\u20135238.","journal-title":"IEEE Transactions on Image Processing (TIP)"},{"key":"1915_CR28","doi-asserted-by":"crossref","unstructured":"Kopf, J., Rong, X., & Huang, J. B. (2021). Robust consistent video depth estimation. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 1611\u20131621).","DOI":"10.1109\/CVPR46437.2021.00166"},{"key":"1915_CR29","doi-asserted-by":"crossref","unstructured":"Laina, I., Rupprecht, C., Belagiannis, V., Tombari, F., & Navab, N. (2016). Deeper depth prediction with fully convolutional residual networks. In International conference on 3D vision (3DV) (pp. 239\u2013248).","DOI":"10.1109\/3DV.2016.32"},{"key":"1915_CR30","doi-asserted-by":"crossref","unstructured":"Le, H. A., Mensink, T., Das, P., Karaoglu, S., & Gevers, T. (2021) EDEN: Multimodal synthetic dataset of enclosed garden scenes. In IEEE winter conference on applications of computer vision (WACV) (pp. 1579\u20131589).","DOI":"10.1109\/WACV48630.2021.00162"},{"key":"1915_CR31","unstructured":"Lee, J. H., Han, M. K., Ko, D. W., & Suh, I. H. (2019). From big to small: Multi-scale local planar guidance for monocular depth estimation. arXiv preprint arXiv:1907.10326"},{"key":"1915_CR32","doi-asserted-by":"crossref","unstructured":"Li, Z., & Snavely, N. (2018). MegaDepth: Learning single-view depth prediction from internet photos. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 2041\u20132050).","DOI":"10.1109\/CVPR.2018.00218"},{"key":"1915_CR33","doi-asserted-by":"crossref","unstructured":"Li, Z., Dekel, T., Cole, F., Tucker, R., Snavely, N., Liu, C., & Freeman, W. T. (2019). Learning the depths of moving people by watching frozen people. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4521\u20134530).","DOI":"10.1109\/CVPR.2019.00465"},{"key":"1915_CR34","doi-asserted-by":"crossref","unstructured":"Li, B., Huang, Y., Liu, Z., Zou, D., & Yu, W. (2021). StructDepth: Leveraging the structural regularities for self-supervised indoor depth estimation. In IEEE international conference on computer vision (ICCV) (pp. 12663\u201312673).","DOI":"10.1109\/ICCV48922.2021.01243"},{"key":"1915_CR35","doi-asserted-by":"crossref","unstructured":"Liu, Z., Hu, H., Lin, Y., Yao, Z., Xie, Z., Wei, Y., Ning, J., Cao, Y., Zhang, Z., Dong, L., Wei, F., & Guo, B. (2022). Swin transformer v2: Scaling up capacity and resolution. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 12009\u201312019).","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"1915_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In IEEE international conference on computer vision (ICCV) (pp. 10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1915_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., & Xie, S. (2022). A convnet for the 2020s. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 11976\u201311986).","DOI":"10.1109\/CVPR52688.2022.01167"},{"issue":"4","key":"1915_CR38","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1145\/3386569.3392377","volume":"39","author":"X Luo","year":"2020","unstructured":"Luo, X., Huang, J. B., Szeliski, R., Matzen, K., & Kopf, J. (2020). Consistent video depth estimation. ACM Transactions on Graphics (ToG), 39(4), 71\u20131.","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"1915_CR39","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1016\/j.neucom.2022.10.073","volume":"517","author":"A Masoumian","year":"2023","unstructured":"Masoumian, A., Rashwan, H. A., Abdulwahab, S., Cristiano, J., Asif, M. S., & Puig, D. (2023). GCNDepth: Self-supervised monocular depth estimation based on graph convolutional network. Neurocomputing, 517, 81\u201392.","journal-title":"Neurocomputing"},{"issue":"14","key":"1915_CR40","doi-asserted-by":"publisher","first-page":"5353","DOI":"10.3390\/s22145353","volume":"22","author":"A Masoumian","year":"2022","unstructured":"Masoumian, A., Rashwan, H. A., Cristiano, J., Asif, M. S., & Puig, D. (2022). Monocular depth estimation using deep learning: A review. Sensors, 22(14), 5353.","journal-title":"Sensors"},{"key":"1915_CR41","unstructured":"Mehta, S., & Rastegari, M. (2021). MobileViT: Light-weight, general-purpose, and mobile-friendly vision transformer. In International conference on learning representations (ICLR)."},{"key":"1915_CR42","doi-asserted-by":"crossref","unstructured":"Miangoleh, S. M. H., Dille, S., Mai, L., Paris, S., & Aksoy, Y. (2021). Boosting monocular depth estimation models to high-resolution via content-adaptive multi-resolution merging. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 9685\u20139694)","DOI":"10.1109\/CVPR46437.2021.00956"},{"key":"1915_CR43","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2020.12.089","volume":"438","author":"Y Ming","year":"2021","unstructured":"Ming, Y., Meng, X., Fan, C., & Yu, H. (2021). Deep learning for monocular depth estimation: A review. Neurocomputing, 438, 14\u201333.","journal-title":"Neurocomputing"},{"key":"1915_CR44","doi-asserted-by":"crossref","unstructured":"Quinonero-Candela, J., Sugiyama, M., Schwaighofer, A., & Lawrence, N. D. (2008). Dataset shift in machine learning. MIT Press.","DOI":"10.7551\/mitpress\/9780262170055.001.0001"},{"key":"1915_CR45","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., & Koltun, V. (2021). Vision transformers for dense prediction. In IEEE international conference on computer vision (ICCV) (pp. 12179\u201312188).","DOI":"10.1109\/ICCV48922.2021.01196"},{"issue":"3","key":"1915_CR46","doi-asserted-by":"publisher","first-page":"1623","DOI":"10.1109\/TPAMI.2020.3019967","volume":"44","author":"R Ranftl","year":"2020","unstructured":"Ranftl, R., Lasinger, K., Hafner, D., Schindler, K., & Koltun, V. (2020). Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 44(3), 1623\u20131637.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1915_CR47","doi-asserted-by":"crossref","unstructured":"Ren, H., Raj, A., El-Khamy, M., & Lee, J. (2020). SUW-Learn: Joint supervised, unsupervised, weakly supervised deep learning for monocular depth estimation. In IEEE conference on computer vision and pattern recognition (CVPR) workshop (pp. 750\u2013751).","DOI":"10.1109\/CVPRW50498.2020.00383"},{"key":"1915_CR48","doi-asserted-by":"crossref","unstructured":"Richter, S. R., Hayder, Z., & Koltun, V. (2017). Playing for benchmarks. In IEEE international conference on computer vision (ICCV) (pp. 2232\u20132241).","DOI":"10.1109\/ICCV.2017.243"},{"issue":"5","key":"1915_CR49","doi-asserted-by":"publisher","first-page":"824","DOI":"10.1109\/TPAMI.2008.132","volume":"31","author":"A Saxena","year":"2008","unstructured":"Saxena, A., Sun, M., & Ng, A. Y. (2008). Make3D: Learning 3D scene structure from a single still image. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 31(5), 824\u2013840.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1915_CR50","doi-asserted-by":"crossref","unstructured":"Schonberger, J. L., & Frahm, J. M. (2016). Structure-from-motion revisited. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4104\u20134113).","DOI":"10.1109\/CVPR.2016.445"},{"key":"1915_CR51","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., & Fergus, R. (2012). Indoor segmentation and support inference from RGBD images. In European conference on computer vision (ECCV) (pp. 746\u2013760).","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"1915_CR52","unstructured":"Tan, M., & Le, Q. (2019). EfficientNet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning (ICML) (pp. 6105\u20136114)."},{"key":"1915_CR53","doi-asserted-by":"crossref","unstructured":"Teed, Z., & Deng, J. (2020). RAFT: Recurrent all-pairs field transforms for optical flow. In European conference on computer vision (ECCV) (pp. 402\u2013419).","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"1915_CR54","unstructured":"The robust vision challenge (2022). http:\/\/www.robustvision.net"},{"key":"1915_CR55","doi-asserted-by":"crossref","unstructured":"Torralba, A., & Efros, A. A. (2011). Unbiased look at dataset bias. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 1521\u20131528).","DOI":"10.1109\/CVPR.2011.5995347"},{"key":"1915_CR56","doi-asserted-by":"crossref","unstructured":"Ummenhofer, B., Zhou, H., Uhrig, J., Mayer, N., Ilg, E., Dosovitskiy, A., & Brox, T. (2017). DeMoN: Depth and motion network for learning monocular stereo. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 5038\u20135047).","DOI":"10.1109\/CVPR.2017.596"},{"key":"1915_CR57","unstructured":"Van Den\u00a0Oord, A., & Vinyals, O. (2017). Neural discrete representation learning. In Advances in neural information processing systems (NeurIPS) (vol.\u00a030)."},{"key":"1915_CR58","unstructured":"Vasiljevic, I., Kolkin, N., Zhang, S., Luo, R., Wang, H., Dai, F. Z., Daniele, A. F., Mostajabi, M., Basart, S., & Walter, M. R., Shakhnarovich, G. (2019). Diode: A dense indoor and outdoor depth dataset. arXiv preprint arXiv:1908.00463"},{"key":"1915_CR59","unstructured":"Vyas, P., Saxena, C., Badapanda, A., & Goswami, A. (2022). Outdoor monocular depth estimation: A research review. arXiv preprint arXiv:2205.01399"},{"key":"1915_CR60","doi-asserted-by":"crossref","unstructured":"Wang, C., Lucey, S., Perazzi, F., & Wang, O. (2019). Web stereo video supervision for depth prediction from dynamic scenes. In International conference on 3D vision (3DV) (pp. 348\u2013357).","DOI":"10.1109\/3DV.2019.00046"},{"key":"1915_CR61","doi-asserted-by":"crossref","unstructured":"Wang, X., Yin, W., Kong, T., Jiang, Y., Li, L., & Shen, C. (2020). Task-aware monocular depth estimation for 3d object detection. In AAAI conference on artificial intelligence (AAAI) (vol.\u00a034, pp. 12257\u201312264).","DOI":"10.1609\/aaai.v34i07.6908"},{"key":"1915_CR62","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., Wang, J., Hall, M., Neumann, U., & Su, S. (2022). Toward practical monocular indoor depth estimation. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 3814\u20133824).","DOI":"10.1109\/CVPR52688.2022.00379"},{"key":"1915_CR63","doi-asserted-by":"crossref","unstructured":"Xian, K., Shen, C., Cao, Z., Lu, H., Xiao, Y., Li, R., & Luo, Z. (2018). Monocular relative depth perception with web stereo data supervision. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 311\u2013320).","DOI":"10.1109\/CVPR.2018.00040"},{"key":"1915_CR64","doi-asserted-by":"crossref","unstructured":"Xian, K., Zhang, J., Wang, O., Mai, L., Lin, Z., & Cao, Z. (2020). Structure-guided ranking loss for single image depth prediction. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 611\u2013620).","DOI":"10.1109\/CVPR42600.2020.00069"},{"key":"1915_CR65","unstructured":"Xu, G., Yin, W., Chen, H., Cheng, K., Zhao, F., & Shen, C. (2022). Boosting monocular depth estimation with sparse guided points. arXiv preprint arXiv:2202.01470"},{"key":"1915_CR66","doi-asserted-by":"crossref","unstructured":"Xu, G., Yin, W., Chen, H., Shen, C., Cheng, K., & Zhao, F. (2023). Pose-free 3d scene reconstruction with frozen depth models. In IEEE international conference on computer vision (ICCV).","DOI":"10.1109\/ICCV51070.2023.00854"},{"key":"1915_CR67","doi-asserted-by":"crossref","unstructured":"Yin, W., Zhang, C., Chen, H., Cai, Z., Yu, G., Wang, K., Chen, X., & Shen, C. (2023). Metric3D: Towards zero-shot metric 3d prediction from a single image. In IEEE international conference on computer vision (ICCV).","DOI":"10.1109\/ICCV51070.2023.00830"},{"key":"1915_CR68","doi-asserted-by":"crossref","unstructured":"Yin, W., Zhang, J., Wang, O., Niklaus, S., Mai, L., Chen, S., & Shen, C. (2021). Learning to recover 3d scene shape from a single image. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 204\u2013213).","DOI":"10.1109\/CVPR46437.2021.00027"},{"issue":"10","key":"1915_CR69","doi-asserted-by":"publisher","first-page":"7282","DOI":"10.1109\/TPAMI.2021.3097396","volume":"44","author":"W Yin","year":"2021","unstructured":"Yin, W., Liu, Y., & Shen, C. (2021). Virtual normal: Enforcing geometric constraints for accurate and robust depth prediction. IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 44(10), 7282\u20137295.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1915_CR70","doi-asserted-by":"crossref","unstructured":"Yuan, W., Gu, X., Dai, Z., Zhu, S., & Tan, P. (2022). Neural window fully-connected CRFs for monocular depth estimation. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 3916\u20133925).","DOI":"10.1109\/CVPR52688.2022.00389"},{"key":"1915_CR71","doi-asserted-by":"crossref","unstructured":"Zhan, H., Garg, R., Weerasekera, C. S., Li, K., Agarwal, H., & Reid, I. (2018). Unsupervised learning of monocular depth estimation and visual odometry with deep feature reconstruction. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 340\u2013349).","DOI":"10.1109\/CVPR.2018.00043"},{"key":"1915_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Lathuiliere, S., Ricci, E., Sebe, N., Yan, Y., & Yang, J. (2020). Online depth learning against forgetting in monocular videos. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 4494\u20134503).","DOI":"10.1109\/CVPR42600.2020.00455"},{"key":"1915_CR73","doi-asserted-by":"crossref","unstructured":"Zhao, S., Fu, H., Gong, M., & Tao, D. (2019). Geometry-aware symmetric domain adaptation for monocular depth estimation. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 9788\u20139798).","DOI":"10.1109\/CVPR.2019.01002"},{"key":"1915_CR74","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., & Jia, J. (2017). Pyramid scene parsing network. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 2881\u20132890).","DOI":"10.1109\/CVPR.2017.660"},{"key":"1915_CR75","doi-asserted-by":"crossref","unstructured":"Zhao, C., Zhang, Y., Poggi, M., Tosi, F., Guo, X., Zhu, Z., Huang, G., Tang, Y., & Mattoccia, S. (2022). MonoViT: Self-supervised monocular depth estimation with a vision transformer. In 2022 international conference on 3D vision (3DV) (pp. 668\u2013678). IEEE","DOI":"10.1109\/3DV57658.2022.00077"},{"issue":"9","key":"1915_CR76","doi-asserted-by":"publisher","first-page":"1612","DOI":"10.1007\/s11431-020-1582-8","volume":"63","author":"C Zhao","year":"2020","unstructured":"Zhao, C., Sun, Q., Zhang, C., Tang, Y., & Qian, F. (2020). Monocular depth estimation based on deep learning: An overview. Science China Technological Sciences, 63(9), 1612\u20131627.","journal-title":"Science China Technological Sciences"},{"issue":"5","key":"1915_CR77","doi-asserted-by":"publisher","first-page":"1237","DOI":"10.1109\/TETCI.2022.3182360","volume":"6","author":"C Zhao","year":"2022","unstructured":"Zhao, C., Tang, Y., & Sun, Q. (2022). Unsupervised monocular depth estimation in highly complex environments. IEEE Transactions on Emerging Topics in Computational Intelligence, 6(5), 1237\u20131246.","journal-title":"IEEE Transactions on Emerging Topics in Computational Intelligence"},{"key":"1915_CR78","doi-asserted-by":"crossref","unstructured":"Zheng, C., Cham, T. J., & Cai, J. (2018). T2Net: Synthetic-to-realistic translation for solving single-image depth estimation tasks. In European conference on computer vision (ECCV) (pp. 767\u2013783).","DOI":"10.1007\/978-3-030-01234-2_47"},{"key":"1915_CR79","doi-asserted-by":"crossref","unstructured":"Zhou, Z., & Dong, Q. (2022). Self-distilled feature aggregation for self-supervised monocular depth estimation. In European conference on computer vision (ECCV) (pp. 709\u2013726).","DOI":"10.1007\/978-3-031-19769-7_41"},{"key":"1915_CR80","doi-asserted-by":"crossref","unstructured":"Zhou, T., Brown, M., Snavely, N., & Lowe, D. G. (2017) Unsupervised learning of depth and ego-motion from video. In IEEE conference on computer vision and pattern recognition (CVPR) (pp. 1851\u20131858).","DOI":"10.1109\/CVPR.2017.700"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01915-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-023-01915-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01915-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T11:09:07Z","timestamp":1711451347000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-023-01915-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,23]]},"references-count":80,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,4]]}},"alternative-id":["1915"],"URL":"https:\/\/doi.org\/10.1007\/s11263-023-01915-6","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10,23]]},"assertion":[{"value":"17 December 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 September 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 October 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}