{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:35Z","timestamp":1781538875476,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Shanghai Municipal Commission of Economy and Informatization through the High-Quality Industrial Development Program","award":["240407"],"award-info":[{"award-number":["240407"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810594","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1251-1259","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Geometry-Guided Depth Correction for Metric Relative Pose Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8888-151X","authenticated-orcid":false,"given":"Shibin","family":"Xie","sequence":"first","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7609-5089","authenticated-orcid":false,"given":"Hao","family":"Yin","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1206-4048","authenticated-orcid":false,"given":"Shuting","family":"Wang","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5769-0104","authenticated-orcid":false,"given":"Xiaokang","family":"Fang","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5756-9114","authenticated-orcid":false,"given":"Liang","family":"Jin","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9733-1563","authenticated-orcid":false,"given":"Haotian","family":"Liu","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6317-1956","authenticated-orcid":false,"given":"Yanting","family":"Zhang","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5217-3155","authenticated-orcid":false,"given":"Shen","family":"Cai","sequence":"additional","affiliation":[{"name":"Donghua University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovi\u0107 Petr Gronat Akihiko Torii Tomas Pajdla and Josef Sivic. 2018. NetVLAD: CNN Architecture for Weakly Supervised Place Recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence(TPAMI) 40 6 (2018) 1437\u20131451.","DOI":"10.1109\/TPAMI.2017.2711011"},{"key":"e_1_3_3_1_3_2","volume-title":"European Conference on Computer Vision(ECCV)","author":"Wynn\u00a0J Vicente S et\u00a0al. Arnold\u00a0E,","year":"2022","unstructured":"Vicente S et\u00a0al. Arnold\u00a0E, Wynn\u00a0J. 2022. Map-free visual relocalization: Metric pose relative to a single image. In European Conference on Computer Vision(ECCV)."},{"key":"e_1_3_3_1_4_2","volume-title":"European conference on computer vision (ECCV)","author":"Li\u00a0S Prisacariu\u00a0V. Balntas\u00a0V,","year":"2018","unstructured":"Prisacariu\u00a0V. Balntas\u00a0V, Li\u00a0S. 2018. Relocnet: Continuous metric learning relocalisation using neural nets. In European conference on computer vision (ECCV)."},{"key":"e_1_3_3_1_5_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Munukutla\u00a0S Prisacariu V A et\u00a0al. Barroso-Laguna\u00a0A,","year":"2024","unstructured":"Prisacariu V A et\u00a0al. Barroso-Laguna\u00a0A, Munukutla\u00a0S. 2024. Matching 2d images in 3d: Metric relative pose from metric correspondences. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_6_2","unstructured":"Wofk D et\u00a0al. Bhat S\u00a0F Birkl\u00a0R. 2023. Zoedepth: Zero-shot transfer by combining relative and metric depth. arXiv:https:\/\/arXiv.org\/abs\/2302.12288 (2023)."},{"key":"e_1_3_3_1_7_2","volume-title":"International Conference on Learning Representations(ICLR)","author":"Bochkovskii Aleksei","year":"2025","unstructured":"Aleksei Bochkovskii, Ama\u00ebl Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan\u00a0R. Richter, and Vladlen Koltun. 2025. Depth Pro: Sharp Monocular Metric Depth in Less Than a Second. In International Conference on Learning Representations(ICLR)."},{"key":"e_1_3_3_1_8_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Krull\u00a0A Nowozin S et\u00a0al. Brachmann\u00a0E,","year":"2017","unstructured":"Nowozin S et\u00a0al. Brachmann\u00a0E, Krull\u00a0A. 2017. Dsac-differentiable ransac for camera localization. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_9_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Brahmbhatt Samarth","year":"2018","unstructured":"Samarth Brahmbhatt, Jinwei Gu, Kihwan Kim, James Hays, and Jan Kautz. 2018. MapNet: Geometry-Aware Learning of Maps for Camera Localization. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01562"},{"key":"e_1_3_3_1_11_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Bhalgat\u00a0Y Li\u00a0X et\u00a0al. Chen\u00a0S,","year":"2024","unstructured":"Li\u00a0X et\u00a0al. Chen\u00a0S, Bhalgat\u00a0Y. 2024. Neural refinement for absolute pose regression with feature synthesis. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_12_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Cavallari\u00a0T Prisacariu V A et\u00a0al. Chen\u00a0S,","year":"2024","unstructured":"Prisacariu V A et\u00a0al. Chen\u00a0S, Cavallari\u00a0T. 2024. Map-relative pose regression for visual re-localization. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_13_2","volume-title":"European Conference on Computer Vision(ECCV)","author":"Li\u00a0X Wang Z et\u00a0al. Chen\u00a0S,","year":"2022","unstructured":"Wang Z et\u00a0al. Chen\u00a0S, Li\u00a0X. 2022. Dfnet: Enhance absolute pose regression with direct feature matching. In European Conference on Computer Vision(ECCV)."},{"key":"e_1_3_3_1_14_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"A\u00a0X Savva M et\u00a0al. Dai\u00a0A, Chang","year":"2017","unstructured":"Savva M et\u00a0al. Dai\u00a0A, Chang A\u00a0X. 2017. Scannet: Richly-annotated 3d reconstructions of indoor scenes. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_15_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Malisiewicz\u00a0T Rabinovich\u00a0A. DeTone\u00a0D,","year":"2018","unstructured":"Rabinovich\u00a0A. DeTone\u00a0D, Malisiewicz\u00a0T. 2018. Superpoint: Self-supervised interest point detection and description. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01380"},{"key":"e_1_3_3_1_17_2","volume-title":"IEEE\/CVF International Conference on Computer Vision(ICCV)","author":"Wang\u00a0Z Sun J et\u00a0al. Ding\u00a0M,","year":"2019","unstructured":"Sun J et\u00a0al. Ding\u00a0M, Wang\u00a0Z. 2019. CamNet: Coarse-to-fine retrieval for camera re-localization. In IEEE\/CVF International Conference on Computer Vision(ICCV)."},{"key":"e_1_3_3_1_18_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Wang\u00a0S Liu S et\u00a0al. Dong\u00a0S,","year":"2025","unstructured":"Liu S et\u00a0al. Dong\u00a0S, Wang\u00a0S. 2025. Reloc3r: Large-scale training of relative camera pose regression for generalizable, fast, and accurate visual localization. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_19_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Sun\u00a0Q B\u00f6kman G et\u00a0al. Edstedt\u00a0J,","year":"2024","unstructured":"B\u00f6kman G et\u00a0al. Edstedt\u00a0J, Sun\u00a0Q. 2024. Roma: Robust dense feature matching. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_20_2","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV)","author":"G. Lowe\u00a0D","year":"1999","unstructured":"Lowe\u00a0D G.1999. Object recognition from local scale-invariant features. In Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV)."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.336"},{"key":"e_1_3_3_1_22_2","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV)","author":"Petrovi\u0107\u00a0L Markovi\u0107 I et\u00a0al. Koledi\u0107\u00a0K,","year":"2025","unstructured":"Markovi\u0107 I et\u00a0al. Koledi\u0107\u00a0K, Petrovi\u0107\u00a0L. 2025. Gvdepth: Zero-shot monocular depth estimation for ground vehicles based on probabilistic cue fusion. In Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV)."},{"key":"e_1_3_3_1_23_2","volume-title":"IEEE international conference on computer vision workshops(ICCVW)","author":"Melekhov\u00a0I Kalia S et\u00a0al. Laskar\u00a0Z,","year":"2017","unstructured":"Kalia S et\u00a0al. Laskar\u00a0Z, Melekhov\u00a0I. 2017. Camera relocalization by computing pairwise relative poses using convolutional neural network. In IEEE international conference on computer vision workshops(ICCVW)."},{"key":"e_1_3_3_1_24_2","volume-title":"European Conference on Computer Vision. Cham: Springer Nature Switzerland(ECCV)","author":"Cabon\u00a0Y Revaud\u00a0J. Leroy\u00a0V,","year":"2024","unstructured":"Revaud\u00a0J. Leroy\u00a0V, Cabon\u00a0Y. 2024. Grounding image matching in 3d with mast3r. In European Conference on Computer Vision. Cham: Springer Nature Switzerland(ECCV)."},{"key":"e_1_3_3_1_25_2","volume-title":"European Conference on Computer Vision(ECCV)","author":"Gu\u00a0J Wu\u00a0B et\u00a0al. Lin\u00a0J,","year":"2024","unstructured":"Wu\u00a0B et\u00a0al. Lin\u00a0J, Gu\u00a0J. 2024. Learning neural volumetric pose features for camera localization. In European Conference on Computer Vision(ECCV)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01616"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00458"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00113"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_1_30_2","volume-title":"International Conference on Learning Representations(ICLR)","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled Weight Decay Regularization. In International Conference on Learning Representations(ICLR)."},{"key":"e_1_3_3_1_31_2","volume-title":"Conference on Robot Learning(CoRL)","author":"Piasco\u00a0N Tsishkou D et\u00a0al. Moreau\u00a0A,","year":"2022","unstructured":"Tsishkou D et\u00a0al. Moreau\u00a0A, Piasco\u00a0N. 2022. Lens: Localization enhanced by nerf synthesis. In Conference on Robot Learning(CoRL)."},{"key":"e_1_3_3_1_32_2","unstructured":"Moutakanni T et\u00a0al. Oquab\u00a0M Darcet\u00a0T. 2023. Dinov2: Learning robust visual features without supervision. arXiv:https:\/\/arXiv.org\/abs\/2304.07193 (2023)."},{"key":"e_1_3_3_1_33_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Y\u00a0H Sakaridis C et\u00a0al. Piccinelli\u00a0L, Yang","year":"2024","unstructured":"Sakaridis C et\u00a0al. Piccinelli\u00a0L, Yang Y\u00a0H. 2024. UniDepth: Universal monocular metric depth estimation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Ren\u00e9 Ranftl Katrin Lasinger David Hafner Konrad Schindler and Vladlen Koltun. 2022. Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer. IEEE Transactions on Pattern Analysis and Machine Intelligence(TPAMI) 44 3 (2022).","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"e_1_3_3_1_35_2","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV)","author":"Bochkovskiy\u00a0A Koltun\u00a0V. Ranftl\u00a0R,","year":"2021","unstructured":"Koltun\u00a0V. Ranftl\u00a0R, Bochkovskiy\u00a0A. 2021. Vision transformers for dense prediction. In Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV)."},{"key":"e_1_3_3_1_36_2","unstructured":"Jawahar C\u00a0V. Saha\u00a0S Varma\u00a0G. 2018. Improved visual relocalization by discovering anchor points. arXiv:https:\/\/arXiv.org\/abs\/1811.04370 (2018)."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00499"},{"key":"e_1_3_3_1_38_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Glocker\u00a0B Zach C et\u00a0al. Shotton\u00a0J,","year":"2013","unstructured":"Zach C et\u00a0al. Shotton\u00a0J, Glocker\u00a0B. 2013. Scene coordinate regression forests for camera relocalization in RGB-D images. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_39_2","unstructured":"Jiaming Sun Zehong Shen Yuang Wang Hujun Bao and Xiaowei Zhou. 2021. LoFTR: Detector-Free Local Feature Matching with Transformers. (2021)."},{"key":"e_1_3_3_1_40_2","unstructured":"Pengju Sun Banglei Guan Zhenbao Yu Yang Shang Qifeng Yu and D\u00e1niel Bar\u00e1th. 2025. Learning Affine Correspondences by Integrating Geometric Constraints. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2025)."},{"key":"e_1_3_3_1_41_2","volume-title":"IEEE International Conference on Robotics and Automation (ICRA)","author":"Brachmann\u00a0E Schindler K et\u00a0al. Turkoglu M\u00a0O,","year":"2021","unstructured":"Schindler K et\u00a0al. Turkoglu M\u00a0O, Brachmann\u00a0E. 2021. Visual camera re-localization using graph neural networks and relative pose supervision. In IEEE International Conference on Robotics and Automation (ICRA)."},{"key":"e_1_3_3_1_42_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Tyszkiewicz Micha\u0142","year":"2020","unstructured":"Micha\u0142 Tyszkiewicz, Pascal Fua, and Eduard Trulls. 2020. DISK: Learning local features with policy gradient. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00496"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02047"},{"key":"e_1_3_3_1_45_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Chen\u00a0M Karaev N et\u00a0al. Wang\u00a0J,","year":"2025","unstructured":"Karaev N et\u00a0al. Wang\u00a0J, Chen\u00a0M. 2025. Vggt: Visual geometry grounded transformer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_46_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Leroy\u00a0V Cabon Y et\u00a0al. Wang\u00a0S,","year":"2024","unstructured":"Cabon Y et\u00a0al. Wang\u00a0S, Leroy\u00a0V. 2024. Dust3r: Geometric 3d vision made easy. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_47_2","volume-title":"IEEE International Conference on Robotics and Automation(ICRA)","author":"Denninger\u00a0M Triebel\u00a0R. Winkelbauer\u00a0D,","year":"2021","unstructured":"Triebel\u00a0R. Winkelbauer\u00a0D, Denninger\u00a0M. 2021. Learning to localize in new environments from synthetic training data. In IEEE International Conference on Robotics and Automation(ICRA)."},{"key":"e_1_3_3_1_48_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Kang\u00a0B Huang Z et\u00a0al. Yang\u00a0L,","year":"2024","unstructured":"Huang Z et\u00a0al. Yang\u00a0L, Kang\u00a0B. 2024. Depth anything: Unleashing the power of large-scale unlabeled data. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_49_2","unstructured":"Huang Z et\u00a0al. Yang\u00a0L Kang\u00a0B. 2024. Depth anything v2. Advances in Neural Information Processing Systems(NeurIPS) (2024)."},{"key":"e_1_3_3_1_50_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Liu\u00a0S Pautrat R et\u00a0al. Yu\u00a0Y,","year":"2025","unstructured":"Pautrat R et\u00a0al. Yu\u00a0Y, Liu\u00a0S. 2025. Relative pose estimation through affine corrections of monocular depth priors. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_51_2","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)","author":"Yang\u00a0L Yang T et\u00a0al. Zhang\u00a0Z,","year":"2025","unstructured":"Yang T et\u00a0al. Zhang\u00a0Z, Yang\u00a0L. 2025. StableDepth: Scene-Consistent and Scale-Invariant Monocular Depth. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR)."},{"key":"e_1_3_3_1_52_2","volume-title":"IEEE International Conference on Robotics and Automation (ICRA)","author":"Sattler\u00a0T Pollefeys M et\u00a0al. Zhou\u00a0Q,","year":"2020","unstructured":"Pollefeys M et\u00a0al. Zhou\u00a0Q, Sattler\u00a0T. 2020. To learn or not to learn: Visual localization from essential matrices. In IEEE International Conference on Robotics and Automation (ICRA)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:02:19Z","timestamp":1781535739000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810594"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":51,"alternative-id":["10.1145\/3805622.3810594","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810594","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}