{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:16:40Z","timestamp":1777889800504,"version":"3.51.4"},"reference-count":120,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100010663","name":"European Research Council (ERC) Advanced Grant SIMULACRON, DFG","doi-asserted-by":"publisher","award":["CR 250\/26-1"],"award-info":[{"award-number":["CR 250\/26-1"]}],"id":[{"id":"10.13039\/100010663","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001659","name":"Deutsche Forschungsgemeinschaft (German Research Foundation, DFG) under Germany's Excellence Strategy (EXC 3066\/1 \u201cThe Adaptive Mind\u201d)","doi-asserted-by":"publisher","award":["533717223"],"award-info":[{"award-number":["533717223"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002347","name":"Federal Ministry of Education and Research","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002347","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100021154","name":"Amazon Research Award","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100021154","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00638","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"6784-6796","source":"Crossref","is-referenced-by-count":0,"title":["Feed-Forward SceneDINO for Unsupervised Semantic Scene Completion"],"prefix":"10.1109","author":[{"given":"Aleksandar","family":"Jevti\u0107","sequence":"first","affiliation":[{"name":"TU Munich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christoph","family":"Reich","sequence":"additional","affiliation":[{"name":"TU Munich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Felix","family":"Wimbauer","sequence":"additional","affiliation":[{"name":"TU Munich"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Oliver","family":"Hahn","sequence":"additional","affiliation":[{"name":"TU Darmstadt"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christian","family":"Rupprecht","sequence":"additional","affiliation":[{"name":"University of Oxford"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stefan","family":"Roth","sequence":"additional","affiliation":[{"name":"TU Darmstadt"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniel","family":"Cremers","sequence":"additional","affiliation":[{"name":"TU Munich"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01513"},{"key":"ref2","article-title":"Self-labelling via simultaneous clustering and representation learning","author":"Markus Asano","year":"2020","journal-title":"ICLR"},{"key":"ref3","first-page":"15509","article-title":"Learning representations by maximizing mutual information across views","author":"Bachman","journal-title":"NeurIPS2019"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0640"},{"key":"ref5","article-title":"VICReg: Variance-invariance-covariance regularization for self-supervised learning","author":"Bardes","year":"2022","journal-title":"ICLR"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00039"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2021.3075644"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00396"},{"key":"ref9","first-page":"9912","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","author":"Caron","journal-title":"NeurIPS 2020"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref12","article-title":"Improved baselines with momentum contrastive learning","author":"Chen","year":"2020","journal-title":"arXiv"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00425"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"ref15","first-page":"107064","article-title":"MVSplat 360: Feed-forward 360 scene synthesis from sparse views","author":"Chen","journal-title":"NeurIPS*2024"},{"key":"ref16","first-page":"2148","article-title":"S3CNet: A sparse semantic scene completion network for LiDAR point clouds","author":"Cheng","year":"2020","journal-title":"CoRL"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01652"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46723-8_49"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref20","article-title":"Cluster and predict latent patches for improved masked image modeling","author":"Darcet","year":"2025","journal-title":"arXiv"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref23","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021","journal-title":"ICLR"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3134634"},{"key":"ref25","article-title":"FeatUp: A model-agnostic framework for features at any resolution","author":"Fu","year":"2024","journal-title":"ICLR"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.699"},{"key":"ref28","first-page":"21271","article-title":"Bootstrap your own latent: A new approach to self-supervised learning","author":"Grill","journal-title":"NeurIPS*2020"},{"key":"ref29","first-page":"40676","article-title":"Siamese Masked Autoencoders","author":"Gupta","journal-title":"NeurIPS*2023"},{"key":"ref30","first-page":"643","article-title":"Semantic abstraction: Openworld 3D scene understanding from 2D vision-language models","author":"Ha","year":"2023","journal-title":"CoRL"},{"key":"ref31","article-title":"Boosting unsupervised semantic segmentation with principal mask proposals","author":"Hahn","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02280"},{"key":"ref33","article-title":"Unsupervised semantic segmentation by distilling feature correspondences","author":"Hamilton","year":"2022","journal-title":"ICLR"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00939"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2954885"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-92659-5_2"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1017\/cbo9780511811685"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/3DV62453.2024.00133"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref41","first-page":"4182","article-title":"Data-efficient image pecognition with contrastive predictive coding","author":"Henaff","year":"2020","journal-title":"ICML"},{"key":"ref42","article-title":"Learning deep representations by mutual information estimation and maximization","author":"Hjelm","year":"2019","journal-title":"ICLR"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3444912"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72754-2_16"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01885"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1561\/0600000079"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00996"},{"key":"ref49","article-title":"GaussTR: Foundation model-aligned Gaussian transformer for self-supervised 3D spatial understanding","author":"Jiang","year":"2024","journal-title":"arXiv"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00338"},{"key":"ref52","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2015","journal-title":"ICLR"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1694"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00393"},{"key":"ref56","first-page":"109","article-title":"Efficient inference in fully connected CRFs with Gaussian edge potentials","author":"Kr\u00e4henb\u00fchl","journal-title":"NIPS*2011"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1002\/nav.3800020109"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00788"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00341"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2953639"},{"key":"ref61","article-title":"Prototypical contrastive learning of unsupervised representations","author":"Li","year":"2021","journal-title":"ICLR"},{"key":"ref62","article-title":"Semi-supervised implicit scene completion from sparse LiDAR","author":"Li","year":"2021","journal-title":"arXiv"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00877"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10802143"},{"key":"ref65","article-title":"FB-OCC: 3D occupancy prediction based on forward-backward view transformation","author":"Li","year":"2023","journal-title":"arXiv"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3179507"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref68","first-page":"261","article-title":"See and think: Disentangling semantic scene completion","author":"Liu","journal-title":"NeurIPS*2018"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1982.1056489"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1016\/j.aei.2018.05.005"},{"key":"ref71","first-page":"281","article-title":"Some methods for classification and analysis of multivariate observations","volume-title":"Berkeley Symp. on Math. Statist. and Prob.","author":"MacQueen","year":"1967"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/2945.468400"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/icra48891.2023.10160800"},{"key":"ref74","article-title":"Occdepth: A depth-aware method for 3D semantic scene completion","author":"Miao","year":"2023","journal-title":"arXiv"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.12.089"},{"key":"ref77","article-title":"R-MAE: Regions meet masked autoencoders","author":"Kien Nguyen","year":"2024","journal-title":"ICLR"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02146"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"ref80","article-title":"DINOv 2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-34141-0_16"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611537"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"ref85","article-title":"SAM 2: Segment anything in images and videos","author":"Ravi","year":"2024","journal-title":"arXiv"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00207"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3095302"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/3DV50981.2020.00021"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01504-5"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.445"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1145\/1772690.1772862"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01872"},{"key":"ref93","article-title":"CLIP-Fields: Weakly supervised semantic fields for robotic memory","volume-title":"ICRA Workshop on Pretraining for Robotics","author":"Muhammad","year":"2023"},{"key":"ref94","first-page":"405","article-title":"Distilled feature fields enable few-shot language-guided manipulation","author":"Shen","year":"2023","journal-title":"CoRL"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00349"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.28"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/3dv66043.2025.00067"},{"key":"ref98","first-page":"68367","article-title":"OpenMask3D: Open-vocabulary 3D instance segmentation","author":"Takmaz","journal-title":"NeurIPS*2023"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00772"},{"key":"ref101","article-title":"VL- Fields: Towards language-grounded neural implicit spatial representations","volume-title":"ICRA Workshop on Representations, Abstractions, and Priors for Robot Learning","author":"Tsagkas","year":"2023"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00056"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.30"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00305"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/3DV62453.2024.00075"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00876"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14505"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16419"},{"key":"ref111","article-title":"EmerneRF: Emergent spatial-temporal scene decomposition via self-supervision","author":"Yang","year":"2024","journal-title":"ICLR"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_26"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1016\/j.compbiomed.2024.108546"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_4"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00789"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00865"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201323"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1017\/S096249291700006X"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446173.pdf?arnumber=11446173","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:21:46Z","timestamp":1777612906000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446173\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":120,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00638","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}