{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T23:02:14Z","timestamp":1779922934546,"version":"3.53.1"},"reference-count":46,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109141","type":"journal-article","created":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T15:13:30Z","timestamp":1779376410000},"page":"109141","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["EndoUFM: Utilizing foundation models for monocular depth estimation of endoscopic images"],"prefix":"10.1016","volume":"203","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0799-4136","authenticated-orcid":false,"given":"Xinning","family":"Yao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8497-8772","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bojian","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jingjing","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinghua","family":"Yue","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fugen","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109141_bib0001","series-title":"Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (volume 1: Long papers)","first-page":"7319","article-title":"Intrinsic dimensionality explains the effectiveness of language model fine-tuning","author":"Aghajanyan","year":"2021"},{"key":"10.1016\/j.neunet.2026.109141_bib0002","unstructured":"Allan, M., Mcleod, J., Wang, C., Rosenthal, J. C., Hu, Z., Gard, N., Eisert, P., Fu, K. X., Zeffiro, T., Xia, W. et al. (2021). Stereo correspondence and reconstruction of endoscopic data challenge. arXiv preprint arXiv: 2101.01133."},{"key":"10.1016\/j.neunet.2026.109141_bib0003","series-title":"2022\u202fIEEE\/RSJ International conference on intelligent robots and systems (IROS)","first-page":"4904","article-title":"Photometric single-view dense 3D reconstruction in endoscopy","author":"Batlle","year":"2022"},{"key":"10.1016\/j.neunet.2026.109141_bib0004","series-title":"Advances in neural information processing systems","article-title":"Unsupervised scale-consistent depth and ego-motion learning from monocular video","volume":"32","author":"Bian","year":"2019"},{"issue":"6","key":"10.1016\/j.neunet.2026.109141_bib0005","doi-asserted-by":"crossref","first-page":"1013","DOI":"10.1007\/s11548-024-03083-5","article-title":"Surgical-dino: Adapter learning of foundation models for depth estimation in endoscopic surgery","volume":"19","author":"Cui","year":"2024","journal-title":"International Journal of Computer Assisted Radiology and Surgery"},{"key":"10.1016\/j.neunet.2026.109141_bib0006","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"208","article-title":"Endodac: Efficient adapting foundation model for self-supervised depth estimation from any endoscopic camera","author":"Cui","year":"2024"},{"key":"10.1016\/j.neunet.2026.109141_bib0007","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2021.102302","article-title":"Serv-CT: A disparity dataset from cone-beam CT for validation of endoscopis 3D reconstruction","volume":"76","author":"Edwards","year":"2022","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.neunet.2026.109141_bib0008","series-title":"International conference on learning representations","article-title":"Training batchnorm and only batchnorm: On the expressive power of random features in {CNN}s","author":"Frankle","year":"2021"},{"key":"10.1016\/j.neunet.2026.109141_bib0009","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"3828","article-title":"Digging into self-supervised monocular depth estimation","author":"Godard","year":"2019"},{"key":"10.1016\/j.neunet.2026.109141_bib0010","unstructured":"Han, J. J., Acar, A., Henry, C., & Wu, J. Y. (2024). Depth anything in medical images: A comparative study. arXiv preprint arXiv: 2401.16600."},{"key":"10.1016\/j.neunet.2026.109141_bib0011","series-title":"International conference on machine learning","first-page":"17783","article-title":"LoRA+: Efficient low rank adaptation of large models","author":"Hayou","year":"2024"},{"key":"10.1016\/j.neunet.2026.109141_bib0012","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"1026","article-title":"Delving deep into rectifiers: Surpassing human-level performance on imagenet classification","author":"He","year":"2015"},{"key":"10.1016\/j.neunet.2026.109141_bib0013","series-title":"International conference on learning representations","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2022"},{"issue":"4","key":"10.1016\/j.neunet.2026.109141_bib0014","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3592433","article-title":"3D Gaussian splatting for real-time radiance field rendering","volume":"42","author":"Kerbl","year":"2023","journal-title":"ACM Transactions on Graphics"},{"key":"10.1016\/j.neunet.2026.109141_bib0015","series-title":"3rd International conference on learning representa tions, ICLR 2015, conferencetrackproceedings","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2015"},{"key":"10.1016\/j.neunet.2026.109141_bib0016","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"4015","article-title":"Segment anything","author":"Kirillov","year":"2023"},{"key":"10.1016\/j.neunet.2026.109141_bib0017","series-title":"12th International conference on learning representations","article-title":"VeRA: Vector-based random matrix adaptation","author":"Kopiczko","year":"2024"},{"key":"10.1016\/j.neunet.2026.109141_bib0018","article-title":"Intrinsic image decomposition as two independent deconvolution problems","volume":"86","author":"Krebs","year":"2020","journal-title":"Signal Processing: Image Communication"},{"issue":"12","key":"10.1016\/j.neunet.2026.109141_bib0019","doi-asserted-by":"crossref","first-page":"9084","DOI":"10.1109\/JBHI.2024.3400804","article-title":"Image intrinsic-based unsupervised monocular depth estimation in endoscopy","volume":"29","author":"Li","year":"2025","journal-title":"IEEE Journal of Biomedical and Health Informatics"},{"key":"10.1016\/j.neunet.2026.109141_bib0020","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"6197","article-title":"Revisiting stereo depth estimation from a sequence-to-sequence perspective with transformers","author":"Li","year":"2021"},{"key":"10.1016\/j.neunet.2026.109141_bib0021","series-title":"2024\u202fIEEE\/RSJ International conference on intelligent robots and systems (IROS)","first-page":"7160","article-title":"Self-supervised monocular depth estimation with effective feature fusion and self distillation","author":"Liu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109141_bib0022","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"7628","article-title":"Frozen pretrained transformers as universal computation engines","volume":"36","author":"Lu","year":"2022"},{"issue":"1","key":"10.1016\/j.neunet.2026.109141_bib0023","doi-asserted-by":"crossref","first-page":"654","DOI":"10.1038\/s41467-024-44824-z","article-title":"Segment anything in medical images","volume":"15","author":"Ma","year":"2024","journal-title":"Nature Communications"},{"issue":"1","key":"10.1016\/j.neunet.2026.109141_bib0024","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1145\/3503250","article-title":"NeRF: Representing scenes as neural radiance fields for view synthesis","volume":"65","author":"Mildenhall","year":"2021","journal-title":"Communications of the ACM"},{"issue":"4","key":"10.1016\/j.neunet.2026.109141_bib0025","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1109\/MSP.2010.936728","article-title":"Three-dimensional tissue deformation recovery and tracking","volume":"27","author":"Mountney","year":"2010","journal-title":"IEEE Signal Processing Magazine"},{"key":"10.1016\/j.neunet.2026.109141_bib0026","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Transactions on Machine Learning Research Journal"},{"key":"10.1016\/j.neunet.2026.109141_bib0027","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2021.102058","article-title":"EndoSLAM dataset and an unsupervised monocular visual odometry and depth estimation approach for endoscopic videos","volume":"71","author":"Ozyoruk","year":"2021","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.neunet.2026.109141_bib0028","series-title":"Advances in neural information processing systems","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019"},{"issue":"10","key":"10.1016\/j.neunet.2026.109141_bib0029","doi-asserted-by":"crossref","first-page":"810","DOI":"10.1007\/s11760-025-04389-w","article-title":"Hadepth: Highlight-aware monocular depth estimation for endoscopy","volume":"19","author":"Peng","year":"2025","journal-title":"Signal, Image and Video Processing"},{"key":"10.1016\/j.neunet.2026.109141_bib0030","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"issue":"3","key":"10.1016\/j.neunet.2026.109141_bib0031","doi-asserted-by":"crossref","first-page":"1623","DOI":"10.1109\/TPAMI.2020.3019967","article-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer","volume":"44","author":"Ranftl","year":"2020","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"4","key":"10.1016\/j.neunet.2026.109141_bib0032","doi-asserted-by":"crossref","first-page":"7225","DOI":"10.1109\/LRA.2021.3095528","article-title":"Endo-depth-and-motion: Reconstruction and tracking in endoscopic videos using depth networks and photometric constraints","volume":"6","author":"Recasens","year":"2021","journal-title":"IEEE Robotics and Automation Letters"},{"key":"10.1016\/j.neunet.2026.109141_bib0033","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2021.102338","article-title":"Self-supervised monocular depth and ego-motion estimation in endoscopy: Appearance flow to the rescue","volume":"77","author":"Shao","year":"2022","journal-title":"Medical Image Analysis"},{"issue":"1","key":"10.1016\/j.neunet.2026.109141_bib0034","doi-asserted-by":"crossref","first-page":"105","DOI":"10.14245\/ns.2449404.702","article-title":"The utilization of navigation and emerging technologies with endoscopic spine surgery: A narrative review","volume":"22","author":"Sharma","year":"2025","journal-title":"Neurospine"},{"key":"10.1016\/j.neunet.2026.109141_bib0035","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"1328","article-title":"Joint depth prediction and semantic segmentation with multi-view sam","author":"Shvets","year":"2024"},{"key":"10.1016\/j.neunet.2026.109141_bib0036","doi-asserted-by":"crossref","first-page":"3517","DOI":"10.1109\/TMM.2023.3312950","article-title":"Unsupervised monocular estimation of depth and visual odometry using attention and depth-pose consistency loss","volume":"26","author":"Song","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109141_bib0037","unstructured":"Tian, Q., Chen, Z., Liao, H., Huang, X., Li, L., Ourselin, S., & Liu, H. (2024). Endoomni: Zero-shot cross-dataset depth estimation in endoscopy by robust self-learning from noisy labels. arXiv preprint arXiv: 2409.05442."},{"key":"10.1016\/j.neunet.2026.109141_bib0038","series-title":"International conference on medical image computing and computer-assisted intervention","first-page":"431","article-title":"Neural rendering for stereo 3D reconstruction of deformable tissues in robotic surgery","author":"Wang","year":"2022"},{"key":"10.1016\/j.neunet.2026.109141_bib0039","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2025.103534","article-title":"MonoPCC: Photometric-invariant cycle constraint for monocular depth estimation of endoscopic images","volume":"102","author":"Wang","year":"2025","journal-title":"Medical Image Analysis"},{"key":"10.1016\/j.neunet.2026.109141_bib0040","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"3","article-title":"Cbam: Convolutional block attention module","author":"Woo","year":"2018"},{"key":"10.1016\/j.neunet.2026.109141_bib0041","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10371","article-title":"Depth anything: Unleashing the power of large-scale unlabeled data","author":"Yang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109141_bib0042","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2024.106410","article-title":"Self-supervised endoscopy depth estimation framework with CLIP-guidance segmentation","volume":"95","author":"Yang","year":"2024","journal-title":"Biomedical Signal Processing and Control"},{"issue":"5","key":"10.1016\/j.neunet.2026.109141_bib0043","doi-asserted-by":"crossref","first-page":"1934","DOI":"10.1109\/TMI.2024.3352390","article-title":"Self-supervised lightweight depth estimation in endoscopy combining CNN and transformer","volume":"43","author":"Yang","year":"2024","journal-title":"IEEE Transactions on Medical Imaging"},{"key":"10.1016\/j.neunet.2026.109141_bib0044","series-title":"11th International conference on learning representations, ICLR 2023","article-title":"Adaptive budget allocation for parameter-efficient fine-tuning","author":"Zhang","year":"2023"},{"key":"10.1016\/j.neunet.2026.109141_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2026.108598","article-title":"Multi-source temporal-depth fusion for robust end-to-end visual odometry","volume":"198","author":"Zhang","year":"2026","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109141_bib0046","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"1851","article-title":"Unsupervised learning of depth and ego-motion from video","author":"Zhou","year":"2017"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006027?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006027?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T22:06:47Z","timestamp":1779919607000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026006027"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":46,"alternative-id":["S0893608026006027"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109141","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"EndoUFM: Utilizing foundation models for monocular depth estimation of endoscopic images","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109141","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109141"}}