{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:10Z","timestamp":1777865290795,"version":"3.51.4"},"reference-count":147,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"publisher","award":["62302246"],"award-info":[{"award-number":["62302246"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004731","name":"ZJNSFC","doi-asserted-by":"publisher","award":["LQ23F010008"],"award-info":[{"award-number":["LQ23F010008"]}],"id":[{"id":"10.13039\/501100004731","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00629","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"6678-6692","source":"Crossref","is-referenced-by-count":0,"title":["Hybrid-Grained Feature Aggregation with Coarse-to-Fine Language Guidance for Self-Supervised Monocular Depth Estimation"],"prefix":"10.1109","author":[{"given":"Wenyao","family":"Zhang","sequence":"first","affiliation":[{"name":"AI Institute, Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence"}]},{"given":"Hongsi","family":"Liu","sequence":"additional","affiliation":[{"name":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"}]},{"given":"Bohan","family":"Li","sequence":"additional","affiliation":[{"name":"AI Institute, Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence"}]},{"given":"Jiawei","family":"He","sequence":"additional","affiliation":[{"name":"CASIA"}]},{"given":"Zekun","family":"Qi","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Yunnan","family":"Wang","sequence":"additional","affiliation":[{"name":"AI Institute, Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence"}]},{"given":"Shengyang","family":"Zhao","sequence":"additional","affiliation":[{"name":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"}]},{"given":"Xinqiang","family":"Yu","sequence":"additional","affiliation":[{"name":"CASIA"}]},{"given":"Wenjun","family":"Zeng","sequence":"additional","affiliation":[{"name":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"}]},{"given":"Xin","family":"Jin","sequence":"additional","affiliation":[{"name":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00218"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25090"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00400"},{"key":"ref4","article-title":"Zoedepth: Zero-shot transfer by combining relative and metric depth","author":"Bhat","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01484-6"},{"key":"ref6","article-title":"Depth pro: Sharp monocular metric depth in less than a second","author":"Bochkovskii","year":"2024","journal-title":"arXiv"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018001"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00823"},{"key":"ref11","article-title":"Prompt learning with op-timal transport for vision-language models","author":"Chen","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref12","article-title":"D 3 epth: Self-supervised depth estimation with dynamic mask in dynamic scenes","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3412632"},{"key":"ref14","article-title":"Autoencoders as cross-modal teachers: Can pretrained 2d image transformers help 3d representation learning?","volume-title":"The Eleventh International Conference on Learning Representations (ICLR)","author":"Dong","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01058"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27927"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610688"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415881"},{"key":"ref19","article-title":"Depth map prediction from a single image using a multi-scale deep network","volume":"27","author":"Eigen","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02059"},{"key":"ref21","first-page":"arXiv-2411","article-title":"Clip meets dino for tuning zero-shot classifier using unlabeled image collections","author":"Imam","year":"2024","journal-title":"arXiv e-prints"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2025.3533207"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_14"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00214"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.699"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00393"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3374045"},{"key":"ref29","first-page":"12626","article-title":"Forget about the lidar: Self-supervised depth estimators with med probability volumes","volume":"33","author":"Gonzalez-Bello","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref30","article-title":"Open-vocabulary detection via vision and language knowledge distillation","author":"Gu","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00256"},{"key":"ref32","article-title":"Semantically-guided representation learning for self-supervised monocular depth","author":"Guizilini","year":"2020","journal-title":"ICLR"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00026"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/3dv66043.2025.00016"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72751-1_9"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_34"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-006-0031-y"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3444912"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00550"},{"key":"ref42","article-title":"Bevdet: High-performance multi-camera 3d object detection in bird-eye-view","author":"Huang","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01255"},{"key":"ref44","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International Conference on Machine Learning","author":"Jia","year":"2021"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref46","article-title":"From clip to dino: Visual encoders shout in multi-modal large language models","author":"Jiang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_4"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00481"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00880"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.112475"},{"key":"ref53","article-title":"Openvla: An open-source vision-language-action model","author":"Kim","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_35"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00333"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref57","article-title":"Bridging stereo geometry and bev representation with reliable mutual interaction for semantic scene completion","author":"Li","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref58","first-page":"131","article-title":"Hierarchical temporal context learning for camera-based semantic scene completion","volume-title":"European Conference on Computer Vision","author":"Li","year":"2024"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01118"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28085"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25225"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00218"},{"key":"ref64","article-title":"Binsformer: Revisiting adaptive bins for monocular depth estimation","author":"Li","year":"2022","journal-title":"arXiv:2204.00987"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00637"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00283"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-88690-7_3"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72995-9_6"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"ref73","article-title":"SegCLIP: Patch aggregation with learnable centers for open-vocabulary semantic segmentation","author":"Luo","year":"2023","journal-title":"ICML"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16329"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00994"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2021.3115401"},{"key":"ref77","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00941"},{"key":"ref79","author":"Paszke","year":"2017","journal-title":"Automatic differentiation in pytorch"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02672"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1250"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01590"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00963"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00329"},{"key":"ref85","first-page":"28223","article-title":"Contrast with reconstruct: Contrastive 3d representation learning guided by generative pretraining","volume-title":"International Conference on Machine Learning","author":"Qi","year":"2023"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72775-7_13"},{"key":"ref87","article-title":"Sofar: Language-grounded orientation bridges spatial reasoning and object manipulation","author":"Qi","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref88","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00818"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2008.132"},{"key":"ref94","article-title":"The surprising effectiveness of diffusion models for optical flow and monocular depth estimation","volume":"36","author":"Saxena","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00729"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2024.3509619"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_34"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00882"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00458"},{"key":"ref100","article-title":"Sparse autoencoders for scientifically rigorous interpretation of vision models","author":"Stevens","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00914"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_27"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610318"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00283"},{"key":"ref105","article-title":"Depth anywhere: Enhancing 360 monocular depth estimation via perspective distillation and unlabeled data augmentation","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02052"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2019.00864"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01919"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28383"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00225"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00122"},{"key":"ref113","article-title":"D3 roma: Disparity diffusion-based depth sensing for material-agnostic robotic manipulation","author":"Wei","year":"2024","journal-title":"ECCV 2024 Workshop on Wild 3D: 3D Modeling, Reconstruction, and Generation in the Wild"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794182"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00040"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"ref117","article-title":"Depth anything v2","author":"Yang","year":"2024","journal-title":"arXiv:2406.09414"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2024.106410"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2024.01.004"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00578"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00830"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00212"},{"key":"ref123","article-title":"Pseudo-lidar++: Accurate depth for 3d object detection in autonomous driving","author":"You","year":"2020","journal-title":"ICLR"},{"key":"ref124","article-title":"Priordiffusion: Leverage language prior in diffusion models for monocular depth estimation","author":"Zeng","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00927"},{"key":"ref126","article-title":"Rsa: Resolving scale ambiguities in monocular depth estimators through language descriptions","author":"Zeng","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3543191"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01778"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3549201"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3211153"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/ICARM54641.2022.9959694"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2024.3521785"},{"key":"ref134","article-title":"Dreamvla: A vision-language-action model dreamed with comprehensive world knowledge","author":"Zhang","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00077"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01485"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/TCI.2016.2644865"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.5244\/c.35.208"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.700"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/CASE49997.2022.9926478"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00098"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01313"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00249"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2025.3544336"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444273.pdf?arnumber=11444273","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:11:32Z","timestamp":1777529492000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444273\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":147,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00629","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}