{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,24]],"date-time":"2025-08-24T01:14:20Z","timestamp":1755998060470,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":35,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819609628"},{"type":"electronic","value":"9789819609635"}],"license":[{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0963-5_17","type":"book-chapter","created":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T07:45:36Z","timestamp":1733557536000},"page":"287-302","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["DepthBLIP-2: Leveraging Language to\u00a0Guide BLIP-2 in\u00a0Understanding Depth Information"],"prefix":"10.1007","author":[{"given":"Wei","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changyong","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chuanxiang","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenhao","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shulei","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,8]]},"reference":[{"key":"17_CR1","unstructured":"Bhat, S.F., Alhashim, I., Wonka, P.: Adabins: Depth estimation using adaptive bins. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4009\u20134018 (2021)"},{"key":"17_CR2","unstructured":"Chang, W., Zhang, Y., Xiong, Z.: Transformer-based monocular depth estimation with attention supervision. In: BMVC. vol.\u00a06, p.\u00a07 (2021)"},{"key":"17_CR3","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning (2021)"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Fu, H., Gong, M., Wang, C., Batmanghelich, K., Tao, D.: Deep ordinal regression network for monocular depth estimation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 2002\u20132011 (2018)","DOI":"10.1109\/CVPR.2018.00214"},{"key":"17_CR5","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: The kitti dataset. The International Journal of Robotics Research 32, 1231\u20131237 (2013)","journal-title":"The International Journal of Robotics Research"},{"key":"17_CR6","unstructured":"Gu, X., Lin, T., Kuo, W., Cui, Y.: Zero-shot detection via vision and language knowledge distillation. CoRR abs\/2104.13921 (2021)"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"17_CR8","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q.V., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning (2021)"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Jiang, H., Ding, L., Hu, J., Huang, R.: Plnet: Plane and line priors for unsupervised indoor depth estimation. In: 2021 International Conference on 3D Vision (3DV). pp. 741\u2013750. IEEE (2021)","DOI":"10.1109\/3DV53792.2021.00083"},{"key":"17_CR10","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning (2021), https:\/\/api.semanticscholar.org\/CorpusID:231839613"},{"key":"17_CR11","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Commun. ACM 60, 84\u201390 (2012)","journal-title":"Commun. ACM"},{"key":"17_CR12","unstructured":"Li, B., Weinberger, K.Q., Belongie, S.J., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. CoRR abs\/2201.03546 (2022)"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Li, D., Li, J., Le, H., Wang, G., Savarese, S., Hoi, S.C.: LAVIS: A one-stop library for language-vision intelligence. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations). pp. 31\u201341. Association for Computational Linguistics, Toronto, Canada (Jul 2023)","DOI":"10.18653\/v1\/2023.acl-demo.3"},{"key":"17_CR14","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning (2023)"},{"key":"17_CR15","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning (2022)"},{"key":"17_CR16","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., Joty, S.R., Xiong, C., Hoi, S.C.H.: Align before fuse: Vision and language representation learning with momentum distillation. In: Neural Information Processing Systems (2021)"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Li, L.H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J.N., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"17_CR18","doi-asserted-by":"publisher","first-page":"837","DOI":"10.1007\/s11633-023-1458-0","volume":"20","author":"Z Li","year":"2022","unstructured":"Li, Z., Chen, Z., Liu, X., Jiang, J.: Depthformer: Exploiting long-range correlation and local information for accurate monocular depth estimation. Machine Intelligence Research 20, 837\u2013854 (2022)","journal-title":"Machine Intelligence Research"},{"key":"17_CR19","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2017), https:\/\/api.semanticscholar.org\/CorpusID:53592270"},{"key":"17_CR20","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2021","unstructured":"Luo, H., Ji, L., Zhong, M., Chen, Y., Lei, W., Duan, N., Li, T.: Clip4clip: An empirical study of clip for end to end video clip retrieval. Neurocomputing 508, 293\u2013304 (2021)","journal-title":"Neurocomputing"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Mahjourian, R., Wicke, M., Angelova, A.: Unsupervised learning of depth and ego-motion from monocular video using 3d geometric constraints. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 5667\u20135675 (2018)","DOI":"10.1109\/CVPR.2018.00594"},{"key":"17_CR22","unstructured":"Mel, M., Siddiqui, M.I., Zanuttigh, P.: End-to-end learning for joint depth and image reconstruction from diffracted rotation. The Visual Computer pp. 1\u201317 (2022)"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Nathan\u00a0Silberman, Derek\u00a0Hoiem, P.K., Fergus, R.: Indoor segmentation and support inference from rgbd images. In: ECCV (2012)","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"17_CR24","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"17_CR25","unstructured":"Saxena, A., Sun, M., Ng, A.Y.: Make3d: Depth perception from a single still image. In: Aaai. vol.\u00a03, pp. 1571\u20131576 (2008)"},{"key":"17_CR26","doi-asserted-by":"publisher","first-page":"441","DOI":"10.1080\/09500340.2014.967321","volume":"62","author":"C Tang","year":"2015","unstructured":"Tang, C., Hou, C., Song, Z.: Depth recovery and refinement from a single image using defocus cues. J. Mod. Opt. 62, 441\u2013448 (2015)","journal-title":"J. Mod. Opt."},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Tsai, Y.M., Chang, Y.L., Chen, L.G.: Block-based vanishing line and vanishing point detection for 3d scene reconstruction. In: 2006 international symposium on intelligent signal processing and communications. pp. 586\u2013589. IEEE (2005)","DOI":"10.1109\/ISPACS.2006.364726"},{"key":"17_CR28","unstructured":"Vaswani, A., Shazeer, N.M., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: Neural Information Processing Systems (2017)"},{"key":"17_CR29","unstructured":"Vyas, P., Saxena, C., Badapanda, A., Goswami, A.: Outdoor monocular depth estimation: A research review. ArXiv abs\/2205.01399 (2022)"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q., Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S., Wei, F.: Image as a foreign language: Beit pretraining for all vision and vision-language tasks. ArXiv abs\/2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"17_CR31","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: Simvlm: Simple visual language model pretraining with weak supervision. ArXiv abs\/2108.10904 (2021)"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Xu, J., De\u00a0Mello, S., Liu, S., Byeon, W., Breuel, T., Kautz, J., Wang, X.: Groupvit: Semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"17_CR33","doi-asserted-by":"publisher","first-page":"250","DOI":"10.1016\/j.neucom.2019.10.107","volume":"379","author":"M Zhang","year":"2020","unstructured":"Zhang, M., Ye, X., Fan, X., Zhong, W.: Unsupervised depth estimation from monocular videos with hybrid geometric-refined loss and contextual attention. Neurocomputing 379, 250\u2013261 (2020)","journal-title":"Neurocomputing"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, R., Zeng, Z., Guo, Z.: Can language understand depth? Proceedings of the 30th ACM International Conference on Multimedia (2022)","DOI":"10.1145\/3503161.3549201"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. International Journal of Computer Vision (IJCV) (2022)","DOI":"10.1007\/s11263-022-01653-1"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0963-5_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T08:40:20Z","timestamp":1733560820000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0963-5_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,8]]},"ISBN":["9789819609628","9789819609635"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0963-5_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,8]]},"assertion":[{"value":"8 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}