{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T04:20:41Z","timestamp":1770524441723,"version":"3.49.0"},"reference-count":63,"publisher":"Tech Science Press","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.063985","type":"journal-article","created":{"date-parts":[[2025,6,5]],"date-time":"2025-06-05T03:49:31Z","timestamp":1749095371000},"page":"3623-3648","source":"Crossref","is-referenced-by-count":1,"title":["Zero-Shot Based Spatial AI Algorithm for Up-to-Date 3D Vision Map Generations in Highly Complex Indoor Environments"],"prefix":"10.32604","volume":"84","author":[{"given":"Sehun","family":"Lee","sequence":"first","affiliation":[]},{"given":"Taehoon","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Junho","family":"Ahn","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"742","DOI":"10.1201\/9781003483755-86","author":"Luleci","year":"2024","journal-title":"Bridge maintenance, safety, management, digitalization and sustainability"},{"key":"ref2","series-title":"2016 IEEE International Conference on Robotics and Automation (ICRA)","first-page":"2583","article-title":"Active sensing data collection with autonomous mobile robots","volume":"2016","author":"Wang","year":"2016 May 16\u201321"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"477","DOI":"10.5194\/isprs-archives-XLIII-B2-2022-477-2022","article-title":"Application of stereo cameras with wide-angle lenses for the indoor mapping","volume":"XLIII\u2013B2\u20132022","author":"Wierzbicki","year":"2022","journal-title":"Int Arch Photogramm Remote Sens Spatial Inf Sci"},{"key":"ref4","doi-asserted-by":"crossref","first-page":"67219","DOI":"10.1109\/ACCESS.2022.3185732","article-title":"PlaneLoc2: indoor global localization using planar segments and passive stereo camera","volume":"10","author":"Wietrzykowski","year":"2022","journal-title":"IEEE Access"},{"key":"ref5","series-title":"2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","first-page":"1366","article-title":"Constructing metric-semantic maps using floor plan priors for long-term indoor localization","volume":"2023","author":"Zimmerman","year":"2023 Oct 1\u20135"},{"key":"ref6","series-title":"2023 IEEE International Conference on Robotics and Automation (ICRA)","first-page":"8371","article-title":"SHINE-mapping: large-scale 3D mapping using sparse hierarchical implicit neural representations","volume":"2023","author":"Zhong","year":"2023 May 29\u2013Jun 2"},{"key":"ref7","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"21584","article-title":"Photo-SLAM: real-time simultaneous localization and photorealistic mapping for monocular, stereo, and RGB-D cameras","volume":"2024","author":"Huang","year":"2024 Jun 16\u201322"},{"key":"ref8","doi-asserted-by":"crossref","first-page":"025203","DOI":"10.1088\/1361-6501\/ac9ed0","article-title":"Fusion of binocular vision, 2D lidar and IMU for outdoor localization and indoor planar mapping","volume":"34","author":"Liu","year":"2023","journal-title":"Meas Sci Technol"},{"key":"ref9","unstructured":"Bommasani R, Hudson DA, Adeli E, Altman R, Arora S, von Arx S, et al. On the opportunities and risks of foundation models. arXiv:2108.07258. 2021."},{"key":"ref10","doi-asserted-by":"crossref","first-page":"2245","DOI":"10.1109\/TPAMI.2024.3506283","article-title":"Foundation models defining a new era in vision: a survey and outlook","volume":"47","author":"Awais","year":"2025","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref11","series-title":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"3992","article-title":"Segment anything","volume":"2023","author":"Kirillov","year":"2023 Oct 1\u20136"},{"key":"ref12","unstructured":"Ravi N, Gabeur V, Hu YT, Hu R, Ryali C, Ma T, et al. Sam 2: segment anything in images and videos. arXiv:2408.00714. 2024."},{"key":"ref13","unstructured":"Yang CY, Huang HW, Chai W, Jiang Z, Hwang JN. SAMURAI: adapting segment anything model for zero-shot visual tracking with motion-aware memory. arXiv:2411.11922. 2024."},{"key":"ref14","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"14408","article-title":"InternImage: exploring large-scale vision foundation models with deformable convolutions","volume":"2023","author":"Wang","year":"2023 Jun 17\u201324"},{"key":"ref15","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"15617","article-title":"FLAVA: a foundational language and vision alignment model","volume":"2022","author":"Singh","year":"2022 Jun 18\u201324"},{"key":"ref16","unstructured":"Yuan L, Chen D, Chen YL, Codella N, Dai X, Gao J, et al. Florence: a new foundation model for computer vision. arXiv:2111.11432. 2021."},{"key":"ref17","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume":"139","author":"Radford","year":"2021"},{"key":"ref18","series-title":"2015 IEEE International Conference on Computer Vision (ICCV)","first-page":"2425","article-title":"VQA: visual question answering","volume":"2015","author":"Antol","year":"2015 Dec 7\u201313"},{"key":"ref19","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10910","article-title":"@ CREPE: can vision-language foundation models reason compositionally?","volume":"2023","author":"Ma","year":"2023 Jun 17\u201324"},{"key":"ref20","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision. International conference on machine learning","volume":"139","author":"Jia","year":"2021"},{"key":"ref21","unstructured":"Foutter M, Bhoj P, Sinha R, Elhafsi A, Banerjee S, Agia C, et al. Adapting a foundation model for space-based tasks. arXiv:2408.05924. 2024."},{"key":"ref22","first-page":"5607315","article-title":"Advancing plain vision transformer toward remote sensing foundation model","volume":"61","author":"Wang","year":"2022","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref23","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","author":"Yu","year":"2016","journal-title":"Computer vision\u2013ECCV 2016"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"59","DOI":"10.1007\/978-3-030-58607-2_4","author":"Hui","year":"2020","journal-title":"Computer vision\u2013ECCV 2020"},{"key":"ref25","series-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10031","article-title":"Multi-task collaborative network for joint referring expression comprehension and segmentation","volume":"2020","author":"Luo","year":"2020 Jun 13\u201319"},{"key":"ref26","doi-asserted-by":"crossref","first-page":"7900","DOI":"10.1109\/TPAMI.2022.3217852","article-title":"VLT: vision-language transformer and query generation for referring segmentation","volume":"45","author":"Ding","year":"2023","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref27","series-title":"2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"15501","article-title":"Encoder fusion network with co-attention embedding for referring image segmentation","volume":"2021","author":"Feng","year":"2021 Jun 20\u201325"},{"key":"ref28","article-title":"Language-aware vision transformer for referring segmentation","author":"Yang","year":"2024","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref29","series-title":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"22233","article-title":"SimpleClick: interactive image segmentation with simple vision transformers","author":"Liu","year":"2023 Oct 1\u20136"},{"key":"ref30","author":"Xu","year":"2024","journal-title":"European Conference on Artificial Intelligence ECAI 2024"},{"key":"ref31","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3459","article-title":"Universal segmentation at arbitrary granularity with language instruction","volume":"2024","author":"Liu","year":"2024 Jun 16\u201322"},{"key":"ref32","series-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"9579","article-title":"LISA: reasoning segmentation via large language model","author":"Lai","year":"2024"},{"key":"ref33","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"18653","article-title":"PolyFormer: referring image segmentation as sequential polygon generation","volume":"2023","author":"Liu","year":"2023 Jun 17\u201324"},{"key":"ref34","first-page":"74","author":"Zhang","year":"2024","journal-title":"Computer vision\u2013ECCV 2024"},{"key":"ref35","series-title":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"2538","article-title":"Segment every reference object in spatial and temporal spaces","volume":"2023","author":"Wu","year":"2023 Oct 1\u20136"},{"key":"ref36","unstructured":"Zhang Y, Cheng T, Zhu L, Hu R, Liu L, Liu H, et al. EVF-SAM: early vision-language fusion for text-prompted segment anything model. arXiv:2406.20076. 2024."},{"key":"ref37","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"13009","article-title":"GLaMM: pixel grounding large multimodal model","volume":"2024","author":"Rasheed","year":"2024 Jun 16\u201322"},{"key":"ref38","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"26364","article-title":"PixelLM: pixel reasoning with large multimodal model","volume":"2024","author":"Ren","year":"2024 Jun 16\u201322"},{"key":"ref39","doi-asserted-by":"crossref","first-page":"485","DOI":"10.1007\/978-3-030-31321-0_42","author":"Lafuente-Arroyo","year":"2019","journal-title":"Pattern recognition and image analysis"},{"key":"ref40","doi-asserted-by":"crossref","first-page":"33","DOI":"10.56578\/ataiml020104","article-title":"Floor segmentation approach using FCM and CNN","volume":"2","author":"Ravishankar","year":"2023","journal-title":"Acadlore Trans AI Mach Learn"},{"key":"ref41","doi-asserted-by":"crossref","first-page":"192","DOI":"10.1016\/j.ins.2021.06.006","article-title":"Cooperative indoor 3D mapping and modeling using LiDAR data","volume":"574","author":"Wen","year":"2021","journal-title":"Inf Sci"},{"key":"ref42","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"6188","article-title":"StyleMesh: style transfer for indoor 3D scene reconstructions","volume":"2022","author":"H\u00f6llein","year":"2022 Jun 18\u201324"},{"key":"ref43","doi-asserted-by":"crossref","first-page":"104174","DOI":"10.1016\/j.cviu.2024.104174","article-title":"Transformer fusion for indoor RGB-D semantic segmentation","volume":"249","author":"Wu","year":"2024","journal-title":"Comput Vis Image Underst"},{"key":"ref44","doi-asserted-by":"crossref","first-page":"108901","DOI":"10.1016\/j.jobe.2024.108901","article-title":"A BIM-enabled digital twin framework for real-time indoor environment monitoring and visualization by integrating autonomous robotics, LiDAR-based 3D mobile mapping, IoT sensing, and indoor positioning technologies","volume":"86","author":"Hu","year":"2024","journal-title":"J Build Eng"},{"key":"ref45","first-page":"5777","article-title":"3D instance segmentation using deep learning on RGB-D indoor data","volume":"72","author":"Muhammad Yasir","year":"2022","journal-title":"Comput Mater Contin"},{"key":"ref46","series-title":"2024 IEEE International Conference on Robotics and Automation (ICRA)","first-page":"15988","article-title":"Language-EXtended indoor SLAM (LEXIS): a versatile system for real-time visual scene understanding","volume":"2024","author":"Kassab","year":"2024 May 13\u201317"},{"key":"ref47","doi-asserted-by":"crossref","first-page":"1567","DOI":"10.1109\/TMM.2020.3001500","article-title":"V-eye: a vision-based navigation system for the visually impaired","volume":"23","author":"Duh","year":"2020","journal-title":"IEEE Trans Multimed"},{"key":"ref48","series-title":"2020 IEEE International Conference on Robotics and Automation (ICRA)","first-page":"9673","article-title":"Hybrid topological and 3D dense mapping through autonomous exploration for large indoor environments","volume":"2020","author":"Gomez","year":"2020 May 31\u2013Aug 31"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"254","DOI":"10.1016\/j.isprsjprs.2021.07.002","article-title":"Automatic voxel-based 3D indoor reconstruction and room partitioning from triangle meshes","volume":"181","author":"H\u00fcbner","year":"2021","journal-title":"ISPRS J Photogramm Remote Sens"},{"key":"ref50","first-page":"2765","article-title":"Vision-based recognition algorithm for up-to-date indoor digital map generations at damaged buildings","volume":"72","author":"Kim","year":"2022","journal-title":"Comput Mater Contin"},{"key":"ref51","doi-asserted-by":"crossref","first-page":"2315","DOI":"10.32604\/iasc.2023.034394","article-title":"Intelligent risk-identification algorithm with vision and 3D LiDAR patterns at damaged buildings","volume":"36","author":"Kim","year":"2023","journal-title":"Intell Autom Soft Comput"},{"key":"ref52","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1023\/A:1022627411411","article-title":"Support-vector networks","volume":"20","author":"Cortes","year":"1995","journal-title":"Mach Learn"},{"key":"ref53","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10371","article-title":"Depth anything: unleashing the power of large-scale unlabeled data","volume":"2024","author":"Yang","year":"2024 Jun 16\u201322"},{"key":"ref54","doi-asserted-by":"crossref","first-page":"679","DOI":"10.1109\/TPAMI.1986.4767851","article-title":"A computational approach to edge detection","volume":"PAMI-8","author":"Canny","year":"1986","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref55","unstructured":"Omorobot. omorobot.com\/r1v2 [Internet]. Ansan, Kyonggi, Republic of Korea [cited 2025 Jan 21]. Available from: https:\/\/www.omorobot.com\/r1v2\/."},{"key":"ref56","unstructured":"Samsung. samsung.com\/sec\/mobile [Internet]. Suwon, Kyonggi, Republic of Korea [cited 2025 Jan 21]. Available from: https:\/\/www.samsung.com\/sec\/mobile\/."},{"key":"ref57","unstructured":"Massachusetts Institute of Technology. Indoor Scene Recognition [Internet]. Cambridge, MA, USA; 2009 [cited 2025 Jan 21]. Available from: https:\/\/web.mit.edu\/torralba\/www\/indoor.html2009."},{"key":"ref58","doi-asserted-by":"crossref","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","author":"Lin","year":"2014","journal-title":"Computer vision\u2013ECCV 2014"},{"key":"ref59","series-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"5351","article-title":"LVIS: a dataset for large vocabulary instance segmentation","volume":"2019","author":"Gupta","year":"2019 Jun 15\u201320"},{"key":"ref60","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, et al. An image is worth 16 \u00d7 16 words: transformers for image recognition at scale. arXiv:2010.11929. 2020."},{"key":"ref61","unstructured":"Meta. SA-V Dataset [Internet]. Menlo Park, CA, USA; 2024 [cited 2025 Jan 21]. Available from: https:\/\/ai.meta.com\/datasets\/segment-anything-video\/."},{"key":"ref62","series-title":"2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"9992","article-title":"Swin transformer: hierarchical vision transformer using shifted windows","volume":"2021","author":"Liu","year":"2021 Oct 10\u201317"},{"key":"ref63","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Denvlin","year":"2019"}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-84-2\/TSP_CMC_63985\/TSP_CMC_63985.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T01:49:13Z","timestamp":1763344153000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v84n2\/62881"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":63,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.063985","relation":{},"ISSN":["1546-2226"],"issn-type":[{"value":"1546-2226","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}