{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:58:54Z","timestamp":1781585934046,"version":"3.54.5"},"reference-count":99,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2022,9,20]],"date-time":"2022-09-20T00:00:00Z","timestamp":1663632000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,9,20]],"date-time":"2022-09-20T00:00:00Z","timestamp":1663632000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2022,12]]},"DOI":"10.1007\/s11263-022-01670-0","type":"journal-article","created":{"date-parts":[[2022,9,20]],"date-time":"2022-09-20T09:04:03Z","timestamp":1663664643000},"page":"2940-2961","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":27,"title":["Scene Reconstruction with Functional Objects for Robot Autonomy"],"prefix":"10.1007","volume":"130","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2649-4577","authenticated-orcid":false,"given":"Muzhi","family":"Han","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8929-134X","authenticated-orcid":false,"given":"Zeyu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3404-3810","authenticated-orcid":false,"given":"Ziyuan","family":"Jiao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xu","family":"Xie","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7024-1545","authenticated-orcid":false,"given":"Yixin","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1925-5973","authenticated-orcid":false,"given":"Song-Chun","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3003-8611","authenticated-orcid":false,"given":"Hangxin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,9,20]]},"reference":[{"key":"1670_CR1","unstructured":"Agin, G. J. and Binford T. O. 1973 \u201cComputer description of curved objects.\u201d International Joint Conference on Artificial Intelligence (IJCAI)."},{"key":"1670_CR2","doi-asserted-by":"crossref","unstructured":"Armeni, I., He, Z. Y., Gwak, J., Zamir, A. R., Fischer, M., Malik, J., & Savarese S. (2019). 3d scene graph: A structure for unified semantics, 3d space, and camera. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/ICCV.2019.00576"},{"key":"1670_CR3","doi-asserted-by":"crossref","unstructured":"Avetisyan, A., Dahnert, M., Dai, A., Savva, M., Chang, A. X., & Nie\u00dfner, M. (2019a). Scan2cad: Learning cad model alignment in rgb-d scans. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00272"},{"key":"1670_CR4","doi-asserted-by":"crossref","unstructured":"Avetisyan, A., Dai, A., & Nie\u00dfner, M. (2019b). End-to-end cad model retrieval and 9dof alignment in 3d scans. In International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2019.00264"},{"key":"1670_CR5","unstructured":"Batra, D., Chang, A. X., Chernova, S., Davison, A. J., Deng, J., Koltun, V., Levine, S., Malik, J., Mordatch, I., & Mottaghi R. et\u00a0al. (2020). Rearrangement: A challenge for embodied ai. arXiv preprint arXiv:2011.01975"},{"key":"1670_CR6","doi-asserted-by":"crossref","unstructured":"Cadena, C., Carlone, L., Carrillo, H., Latif, Y., Scaramuzza, D., Neira, J., et\u00a0al. (2016). Past, present, and future of simultaneous localization and mapping: Toward the robust-perception age. IEEE Transactions on Robotics (T-RO), 32(6), 1309\u20131332.","DOI":"10.1109\/TRO.2016.2624754"},{"key":"1670_CR7","doi-asserted-by":"crossref","unstructured":"Chang, A., Dai, A., Funkhouser, T., Halber, M., Niebner, M., Savva, M., Song, S., Zeng, A., & Zhang Y. (2017). Matterport3d: Learning from rgb-d data in indoor environments. In International Conference on 3D Vision (3DV).","DOI":"10.1109\/3DV.2017.00081"},{"key":"1670_CR8","unstructured":"Chang, A. X., Funkhouser, T., Guibas, L., Hanrahan, P., Huang, Q., Li, Z., Savarese, S., Savva, M., Song, S., & Su H. et\u00a0al. (2015). Shapenet: An information-rich 3d model repository. arXiv preprint arXiv:1512.03012"},{"issue":"9","key":"1670_CR9","doi-asserted-by":"publisher","first-page":"2165","DOI":"10.1109\/TPAMI.2017.2748579","volume":"40","author":"HJ Chang","year":"2017","unstructured":"Chang, H. J., & Demiris, Y. (2017). Highly articulated kinematic structure estimation combining motion and skeleton information. Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 40(9), 2165\u20132179.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1670_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Y., Huang, S., Yuan, T., Qi, S., Zhu, Y., & Zhu, S. C. (2019). Holistic++ scene understanding: Single-view 3d holistic scene parsing and human pose estimation with human-object interaction and physical commonsense. In International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2019.00874"},{"key":"1670_CR11","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A. X., Savva, M., Halber, M., Funkhouser, T., & Nie\u00dfner, M. (2017). Scannet: Richly-annotated 3d reconstructions of indoor scenes. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2017.261"},{"key":"1670_CR12","doi-asserted-by":"crossref","unstructured":"Deitke, M., Han, W., Herrasti, A., Kembhavi, A., Kolve, E., Mottaghi, R., Salvador, J., Schwenk, D., VanderBilt, E., & Wallingford, M. et\u00a0al. (2020). Robothor: An open simulation-to-real embodied ai platform. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.00323"},{"key":"1670_CR13","doi-asserted-by":"crossref","unstructured":"Edmonds, M., Gao, F., Liu, H., Xie, X., Qi, S., Rothrock, B., et\u00a0al. (2019). A tale of two explanations: Enhancing human trust by explaining robot behavior. Science Robotics, 4(37), eaay4663.","DOI":"10.1126\/scirobotics.aay4663"},{"key":"1670_CR14","doi-asserted-by":"crossref","unstructured":"Edmonds, M., Gao, F., Xie, X., Liu, H., Qi, S., Zhu, Y., Rothrock, B., & Zhu, S. C. (2017). Feeling the force: Integrating force and pose for fluent discovery through imitation learning to open medicine bottles. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS.2017.8206196"},{"key":"1670_CR15","doi-asserted-by":"crossref","unstructured":"Furrer, F., Novkovic, T., Fehr, M., Gawel, A., Grinvald, M., Sattler, T., Siegwart, R., & Nieto J. (2018). Incremental object database: Building 3d models from multiple partial observations. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS.2018.8594391"},{"key":"1670_CR16","doi-asserted-by":"crossref","unstructured":"Garrett, C. R., Paxton, C., Lozano-P\u00e9rez, T., Kaelbling, L. P., & Fox D. (2020). Online replanning in belief space for partially observable task and motion problems. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA40945.2020.9196681"},{"key":"1670_CR17","doi-asserted-by":"crossref","unstructured":"Gibson, J. J. (1950). The perception of the visual world. Houghton Mifflin.","DOI":"10.2307\/1418003"},{"key":"1670_CR18","unstructured":"Gibson, J. J. (1966). The senses considered as perceptual systems. Houghton Mifflin."},{"key":"1670_CR19","doi-asserted-by":"crossref","unstructured":"Grinvald, M., Furrer, F., Novkovic, T., Chung, J. J., Cadena, C., Siegwart, R., & Nieto, J. (2019). Volumetric instance-aware semantic mapping and 3d object discovery. IEEE Robotics and Automation Letters (RA-L), 4(3), 3037\u20133044.","DOI":"10.1109\/LRA.2019.2923960"},{"key":"1670_CR20","doi-asserted-by":"crossref","unstructured":"Gupta, S., Arbel\u00e1ez, P., Girshick, R., & Malik J. (2015). Aligning 3d models to rgb-d images of cluttered scenes. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7299105"},{"key":"1670_CR21","doi-asserted-by":"crossref","unstructured":"Han, L., Zheng, T., Xu, L., & Fang, L. (2020). Occuseg: Occupancy-aware 3d instance segmentation. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.00301"},{"key":"1670_CR22","doi-asserted-by":"crossref","unstructured":"Han, M., Zhang, Z., Jiao, Z., Xie, X., Zhu, Y., Zhu, S. C., & Liu H. (2021). Reconstructing interactive 3d scenes by panoptic mapping and cad model alignments. In IEEE International Conference on Robotics and Automation (ICRA). IEEE.","DOI":"10.1109\/ICRA48506.2021.9561546"},{"key":"1670_CR23","volume-title":"Multiple view geometry in computer vision","author":"R Hartley","year":"2003","unstructured":"Hartley, R., & Zisserman, A. (2003). Multiple view geometry in computer vision. Cambridge University Press."},{"key":"1670_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick R. (2017). Mask r-cnn. In International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2017.322"},{"key":"1670_CR25","doi-asserted-by":"crossref","unstructured":"Hoang, D. C., Lilienthal, A. J., & Stoyanov, T. (2020). Panoptic 3d mapping and object pose estimation using adaptively weighted semantic information. IEEE Robotics and Automation Letters (RA-L), 5(2), 1962\u20131969.","DOI":"10.1109\/LRA.2020.2970682"},{"key":"1670_CR26","doi-asserted-by":"crossref","unstructured":"Hua, B. S., Pham, Q. H., Nguyen, D. T., Tran, M. K., Yu, L. F., & Yeung S. K. (2016). Scenenn: A scene meshes dataset with annotations. In International Conference on 3D Vision (3DV).","DOI":"10.1109\/3DV.2016.18"},{"key":"1670_CR27","doi-asserted-by":"crossref","unstructured":"Hua, B. S., Tran, M. K., & Yeung, S. K. (2018). Pointwise convolutional neural networks. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2018.00109"},{"key":"1670_CR28","unstructured":"Huang, S., Qi, S., Xiao, Y., Zhu, Y., Wu, Y. N., & Zhu, S. C. (2018a). Cooperative holistic scene understanding: Unifying 3d object, layout, and camera pose estimation. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"1670_CR29","doi-asserted-by":"crossref","unstructured":"Huang, S., Qi, S., Zhu, Y., Xiao, Y., Xu, Y., & Zhu, S. C. (2018b). Holistic 3d scene parsing and reconstruction from a single rgb image. In European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-030-01234-2_12"},{"key":"1670_CR30","doi-asserted-by":"crossref","unstructured":"Ikeuchi, K., & Hebert M. (1992). Task-oriented vision. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS.1992.602084"},{"key":"1670_CR31","doi-asserted-by":"crossref","unstructured":"Jia, B., Chen, Y., Huang, S., Zhu, Y., & Zhu, S. C. (2020). Lemma: A multi-view dataset for learning multi-agent multi-task activities. In European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-030-58574-7_46"},{"issue":"9","key":"1670_CR32","doi-asserted-by":"publisher","first-page":"920","DOI":"10.1007\/s11263-018-1103-5","volume":"126","author":"C Jiang","year":"2018","unstructured":"Jiang, C., Qi, S., Zhu, Y., Huang, S., Lin, J., Yu, L. F., et al. (2018). Configurable 3d scene synthesis and 2d image rendering with per-pixel ground truth using stochastic grammars. International Journal of Computer Vision (IJCV), 126(9), 920\u2013941.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"1670_CR33","doi-asserted-by":"crossref","unstructured":"Jiao, Z., Niu, Y., Zhang, Z., Zhu, S. C., Zhu, Y., & Liu, H. (2022). Sequential Manipulation Planning on Scene Graph. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS47612.2022.9981735"},{"key":"1670_CR34","doi-asserted-by":"crossref","unstructured":"Jiao, Z., Zhang, Z., Jiang, X., Han, D., Zhu, S. C., Zhu, Y., & Liu, H. (2021a). Consolidating kinematic models to promote coordinated mobile manipulations. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS51168.2021.9636351"},{"key":"1670_CR35","doi-asserted-by":"crossref","unstructured":"Jiao, Z., Zhang, Z., Wang, W., Han, D., Zhu, S. C., Zhu, Y., & Liu H. (2021b). Efficient task planning for mobile manipulation: A virtual kinematic chain perspective. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS51168.2021.9636554"},{"issue":"4","key":"1670_CR36","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1007\/BF02278710","volume":"38","author":"R Jonker","year":"1987","unstructured":"Jonker, R., & Volgenant, A. (1987). A shortest augmenting path algorithm for dense and sparse linear assignment problems. Computing, 38(4), 325\u2013340.","journal-title":"Computing"},{"issue":"6506","key":"1670_CR37","doi-asserted-by":"publisher","first-page":"915","DOI":"10.1126\/science.aaz7597","volume":"369","author":"LP Kaelbling","year":"2020","unstructured":"Kaelbling, L. P. (2020). The foundation of efficient robot learning. Science, 369(6506), 915\u2013916.","journal-title":"Science"},{"key":"1670_CR38","doi-asserted-by":"crossref","unstructured":"Kaelbling, L. P., & Lozano-P\u00e9rez, T. (2011). Hierarchical task and motion planning in the now. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2011.5980391"},{"key":"1670_CR39","doi-asserted-by":"crossref","unstructured":"Kirillov, A., He, K., Girshick, R., Rother, C., & Doll\u00e1r, P. (2019). Panoptic segmentation. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00963"},{"key":"1670_CR40","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511984037","volume-title":"Perception as Bayesian inference","author":"DC Knill","year":"1996","unstructured":"Knill, D. C., & Richards, W. (1996). Perception as Bayesian inference. Cambridge University Press."},{"key":"1670_CR41","doi-asserted-by":"crossref","unstructured":"Li, X., Liu, S., Kim, K., Wang, X., Yang, M. H., & Kautz, J. (2019). Putting humans in a scene: Learning affordance in 3d indoor environments. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.01265"},{"key":"1670_CR42","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, H., Yi, L., Guibas, L. J., Abbott, A. L., & Song, S. (2020). Category-level articulated object pose estimation. In International Conference on Computer Vision (ICCV).","DOI":"10.1109\/CVPR42600.2020.00376"},{"key":"1670_CR43","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1670_CR44","doi-asserted-by":"crossref","unstructured":"Liu, H., Zhang, Y., Si, W., Xie, X., Zhu, Y., & Zhu, S. C. (2018a). Interactive robot knowledge patching using augmented reality. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2018.8462837"},{"key":"1670_CR45","doi-asserted-by":"crossref","unstructured":"Liu, H., Zhang, C., Zhu, Y., Jiang, C., & Zhu S. C. (2019). Mirroring without overimitation: Learning functionally equivalent manipulation actions. In AAAI Conference on Artificial Intelligence (AAAI).","DOI":"10.1609\/aaai.v33i01.33018025"},{"issue":"4","key":"1670_CR46","first-page":"1","volume":"37","author":"L Liu","year":"2018","unstructured":"Liu, L., Xia, X., Sun, H., Shen, Q., Xu, J., Chen, B., et al. (2018). Object-aware guidance for autonomous scene reconstruction. ACM Transactions on Graphics (TOG), 37(4), 1\u201312.","journal-title":"ACM Transactions on Graphics (TOG)"},{"issue":"06","key":"1670_CR47","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1142\/S0218195902001006","volume":"12","author":"G Malandain","year":"2002","unstructured":"Malandain, G., & Boissonnat, J. D. (2002). Computing the diameter of a point set. International Journal of Computational Geometry & Applications, 12(06), 489\u2013509.","journal-title":"International Journal of Computational Geometry & Applications"},{"issue":"5","key":"1670_CR48","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1109\/TPAMI.2004.1273918","volume":"26","author":"DR Martin","year":"2004","unstructured":"Martin, D. R., Fowlkes, C. C., & Malik, J. (2004). Learning to detect natural image boundaries using local brightness, color, and texture cues. Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 26(5), 530\u2013549.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1670_CR49","doi-asserted-by":"crossref","unstructured":"Mart\u00edn-Mart\u00edn R., & Brock, O. (2019). Coupled recursive estimation for online interactive perception of articulated objects. International Journal of Robotics Research (IJRR), 1\u201337.","DOI":"10.1177\/0278364919848850"},{"key":"1670_CR50","doi-asserted-by":"crossref","unstructured":"McCormac, J., Clark, R., Bloesch, M., Davison, A., & Leutenegger S. (2018). Fusion++: Volumetric object-level slam. In International Conference on 3D Vision (3DV).","DOI":"10.1109\/3DV.2018.00015"},{"key":"1670_CR51","doi-asserted-by":"crossref","unstructured":"McCormac, J., Handa, A., Davison, A., & Leutenegger, S. (2017). Semanticfusion: Dense 3d semantic mapping with convolutional neural networks. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2017.7989538"},{"issue":"4","key":"1670_CR52","doi-asserted-by":"publisher","first-page":"237","DOI":"10.1109\/TCDS.2016.2614992","volume":"8","author":"H Min","year":"2016","unstructured":"Min, H., Luo, R., Zhu, J., Bi, S., et al. (2016). Affordance research in developmental robotics: A survey. IEEE Transactions on Cognitive and Developmental Systems, 8(4), 237\u2013255.","journal-title":"IEEE Transactions on Cognitive and Developmental Systems"},{"issue":"1\u20133","key":"1670_CR53","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1016\/0004-3702(92)90007-K","volume":"58","author":"S Minton","year":"1992","unstructured":"Minton, S., Johnston, M. D., Philips, A. B., & Laird, P. (1992). Minimizing conflicts: A heuristic repair method for constraint satisfaction and scheduling problems. Artificial Intelligence, 58(1\u20133), 161\u2013205.","journal-title":"Artificial Intelligence"},{"key":"1670_CR54","doi-asserted-by":"crossref","unstructured":"Mo, K., Zhu, S., Chang, A. X., Yi, L., Tripathi, S., Guibas, L. J., & Su, H. (2019). Partnet: A large-scale benchmark for fine-grained and hierarchical part-level 3d object understanding. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00100"},{"key":"1670_CR55","doi-asserted-by":"crossref","unstructured":"Mor\u00e9, J. J. (1978). The Levenberg-Marquardt algorithm: Implementation and theory. In Numerical analysis (pp. 105\u2013116). Springer.","DOI":"10.1007\/BFb0067700"},{"key":"1670_CR56","doi-asserted-by":"crossref","unstructured":"Mur-Artal, R., & Tard\u00f3s, J. D. (2017). Orb-slam2: An open-source slam system for monocular, stereo, and rgb-d cameras. IEEE Transactions on Robotics (T-RO), 33(5), 1255\u20131262.","DOI":"10.1109\/TRO.2017.2705103"},{"key":"1670_CR57","doi-asserted-by":"crossref","unstructured":"Myers, A., Teo, C. L., Ferm\u00fcller, C., & Aloimonos, Y. (2015). Affordance detection of tool parts from geometric features. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2015.7139369"},{"key":"1670_CR58","doi-asserted-by":"crossref","unstructured":"Narita, G., Seno, T., Ishikawa, T., & Kaji Y. (2019). Panopticfusion: Online volumetric semantic mapping at the level of stuff and things. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS40897.2019.8967890"},{"key":"1670_CR59","doi-asserted-by":"crossref","unstructured":"Oleynikova, H., Taylor, Z., Fehr, M., Siegwart, R., & Nieto, J. (2017). Voxblox: Incremental 3d euclidean signed distance fields for on-board mav planning. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS.2017.8202315"},{"key":"1670_CR60","doi-asserted-by":"crossref","unstructured":"Pham, Q. H., Hua, B. S., Nguyen, T., & Yeung, S. K. (2019a). Real-time progressive 3d semantic segmentation for indoor scenes. In Proceedings of Winter Conference on Applications of Computer Vision (WACV).","DOI":"10.1109\/WACV.2019.00121"},{"key":"1670_CR61","doi-asserted-by":"crossref","unstructured":"Pham, Q. H., Nguyen, T., Hua, B. S., Roig, G., & Yeung, S. K. (2019b). Jsis3d: Joint semantic-instance segmentation of 3d point clouds with multi-task pointwise networks and multi-value conditional random fields. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00903"},{"key":"1670_CR62","unstructured":"Pham, Q. H., Tran, M. K., Li, W., Xiang, S., Zhou, H., Nie, W., Liu, A., Su, Y., Tran, M. T., & Bui, N. M. et\u00a0al. (2018). Shrec\u201918: Rgb-d object-to-cad retrieval. In 3DOR: Proceedings of the 11th Eurographics Workshop on 3D Object Retrieval."},{"key":"1670_CR63","doi-asserted-by":"crossref","unstructured":"Pronobis, A., & Jensfelt, P. (2012). Large-scale semantic mapping and reasoning with heterogeneous modalities. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2012.6224637"},{"key":"1670_CR64","doi-asserted-by":"publisher","first-page":"2538","DOI":"10.1109\/TPAMI.2020.2976971","volume":"43","author":"S Qi","year":"2020","unstructured":"Qi, S., Jia, B., Huang, S., Wei, P., & Zhu, S. C. (2020). A generalized earley parser for human activity parsing and prediction. Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 43, 2538\u20132554.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1670_CR65","doi-asserted-by":"crossref","unstructured":"Qi, S., Zhu, Y., Huang, S., Jiang, C., & Zhu, S. C. (2018). Human-centric indoor scene synthesis using stochastic grammar. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2018.00618"},{"issue":"6","key":"1670_CR66","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2016). Faster r-cnn: Towards real-time object detection with region proposal networks. Transactions on Pattern Analysis and Machine Intelligence (TPAMI), 39(6), 1137\u20131149.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"1670_CR67","doi-asserted-by":"crossref","unstructured":"Rosinol, A., Gupta, A., Abate, M., Shi, J., & Carlone, L. (2020). 3d dynamic scene graphs: Actionable spatial perception with places, objects, and humans. In Robotics: Science and Systems (RSS).","DOI":"10.15607\/RSS.2020.XVI.079"},{"key":"1670_CR68","doi-asserted-by":"crossref","unstructured":"Savva, M., Kadian, A., Maksymets, O., Zhao, Y., Wijmans, E., Jain, B., Straub, J., Liu, J., Koltun, V., & Malik J. et\u00a0al. (2019). Habitat: A platform for embodied ai research. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/ICCV.2019.00943"},{"key":"1670_CR69","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., & Fergus, R. (2012). Indoor segmentation and support inference from rgbd images. In European Conference on Computer Vision (ECCV). Springer.","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"1670_CR70","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S. P., & Xiao, J. (2015). Sun rgb-d: A rgb-d scene understanding benchmark suite. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"1670_CR71","doi-asserted-by":"crossref","unstructured":"Song, S., Yu, F., Zeng, A., Chang, A. X., Savva, M., & Funkhouser, T. (2017). Semantic scene completion from a single depth image. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2017.28"},{"key":"1670_CR72","doi-asserted-by":"crossref","unstructured":"Srivastava, S., Fang, E., Riano, L., Chitnis, R., Russell, S., & Abbeel, P. (2014). Combined task and motion planning through an extensible planner-independent interface layer. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2014.6906922"},{"key":"1670_CR73","doi-asserted-by":"publisher","first-page":"477","DOI":"10.1613\/jair.3229","volume":"41","author":"J Sturm","year":"2011","unstructured":"Sturm, J., Stachniss, C., & Burgard, W. (2011). A probabilistic framework for learning kinematic models of articulated objects. Journal of Artificial Intelligence Research, 41, 477\u2013526.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"1670_CR74","doi-asserted-by":"crossref","unstructured":"Sui, Z., Chang, H., Xu, N., & Jenkins, O. C. (2020). Geofusion: Geometric consistency informed scene estimation in dense clutter. IEEE Robotics and Automation Letters (RA-L), 5(4), 5913\u20135920.","DOI":"10.1109\/LRA.2020.3010443"},{"key":"1670_CR75","doi-asserted-by":"crossref","unstructured":"Taguchi, Y., Jian, Y. D., Ramalingam, S., & Feng, C. (2013). Point-plane slam for hand-held 3d sensors. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2013.6631318"},{"key":"1670_CR76","doi-asserted-by":"crossref","unstructured":"Wada, K., Sucar, E., James, S., Lenton, D., & Davison, A. J. (2020). Morefusion: Multi-object reasoning for 6d pose estimation from volumetric fusion. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.01455"},{"key":"1670_CR77","doi-asserted-by":"crossref","unstructured":"Wald, J., Dhamo, H., Navab, N., & Tombari, F. (2020). Learning 3d semantic scene graphs from 3d indoor reconstructions. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.00402"},{"key":"1670_CR78","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W. Y., & Girshick, R. (2019). Detectron2. https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"1670_CR79","doi-asserted-by":"crossref","unstructured":"Xiang, F., Qin, Y., Mo, K., Xia, Y., Zhu, H., Liu, F., Liu, M., Jiang, H., Yuan, Y., & Wang H, et\u00a0al. (2020). Sapien: A simulated part-based interactive environment. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.01111"},{"key":"1670_CR80","doi-asserted-by":"crossref","unstructured":"Xia, F., Shen, W. B., Li, C., Kasimbeg, P., Tchapmi, M. E., Toshev, A., et\u00a0al. (2020). Interactive Gibson benchmark: A benchmark for interactive navigation in cluttered environments. IEEE Robotics and Automation Letters (RA-L), 5(2), 713\u2013720.","DOI":"10.1109\/LRA.2020.2965078"},{"key":"1670_CR81","doi-asserted-by":"crossref","unstructured":"Xie, X., Liu, H., Zhang, Z., Qiu, Y., Gao, F., Qi, S., Zhu, Y., & Zhu, S. C. (2019). Vrgym: A virtual testbed for physical and interactive ai. In Proceedings of the ACM Turing Celebration Conference-China, pp. 1\u20136.","DOI":"10.1145\/3321408.3322633"},{"issue":"6","key":"1670_CR82","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2816795.2818075","volume":"34","author":"K Xu","year":"2015","unstructured":"Xu, K., Huang, H., Shi, Y., Li, H., Long, P., Caichen, J., et al. (2015). Autoscanning for coupled scene reconstruction and proactive object analysis. ACM Transactions on Graphics (TOG), 34(6), 1\u201314.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"1670_CR83","doi-asserted-by":"crossref","unstructured":"Yang, S., & Scherer, S. (2019a). Cubeslam: Monocular 3-d object slam. IEEE Transactions on Robotics (T-RO), 35(4), 925\u2013938.","DOI":"10.1109\/TRO.2019.2909168"},{"key":"1670_CR84","doi-asserted-by":"crossref","unstructured":"Yang, S., & Scherer, S. (2019b). Monocular object and plane slam in structured environments. IEEE Robotics and Automation Letters (RA-L), 4(4), 3145\u20133152.","DOI":"10.1109\/LRA.2019.2924848"},{"key":"1670_CR85","doi-asserted-by":"crossref","unstructured":"Yi, L., Zhao, W., Wang, H., Sung, M., & Guibas, L. J. (2019). Gspn: Generative shape proposal network for 3d instance segmentation in point cloud. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00407"},{"key":"1670_CR86","doi-asserted-by":"crossref","unstructured":"Yuan, T., Liu, H., Fan, L., Zheng, Z., Gao, T., Zhu, Y., & Zhu, S. C. (2020). Joint inference of states, robot knowledge, and human (false-)beliefs. In IEEE International Conference on Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA40945.2020.9197355"},{"issue":"4","key":"1670_CR87","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2010324.1964981","volume":"30","author":"LF Yu","year":"2011","unstructured":"Yu, L. F., Yeung, S. K., Tang, C. K., Terzopoulos, D., Chan, T. F., & Osher, S. J. (2011). Make it home: Automatic optimization of furniture arrangement. ACM Transactions on Graphics (TOG), 30(4), 1\u201312.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"1670_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Jiao, Z., Wang, W., Zhu, Y., Zhu, S. C., & Liu, H. (2022). Understanding Physical Effects for Effective Tool-use. IEEE Robotics and Automation Letters (RA-L), 7(4), 9469\u20139476.","DOI":"10.1109\/LRA.2022.3191793"},{"key":"1670_CR89","doi-asserted-by":"publisher","first-page":"179118","DOI":"10.1109\/ACCESS.2019.2958671","volume":"7","author":"J Zhang","year":"2019","unstructured":"Zhang, J., Zhao, X., Chen, Z., & Lu, Z. (2019). A review of deep learning-based semantic segmentation for point cloud. IEEE Access, 7, 179118\u2013179133.","journal-title":"IEEE Access"},{"issue":"6","key":"1670_CR90","doi-asserted-by":"publisher","first-page":"1245","DOI":"10.1137\/0218082","volume":"18","author":"K Zhang","year":"1989","unstructured":"Zhang, K., & Shasha, D. (1989). Simple fast algorithms for the editing distance between trees and related problems. SIAM Journal on Computing, 18(6), 1245\u20131262.","journal-title":"SIAM Journal on Computing"},{"key":"1670_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhu, Y., & Zhu, S. C. (2020). Graph-based hierarchical knowledge representation for robot task transfer from virtual to physical world. In IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS).","DOI":"10.1109\/IROS45743.2020.9340843"},{"key":"1670_CR92","unstructured":"Zhao, Y., & Zhu, S. C. (2011). Image parsing with stochastic scene grammar. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"1670_CR93","doi-asserted-by":"crossref","unstructured":"Zhao, Y., & Zhu, S. C. (2013). Scene parsing by integrating function, geometry and appearance models. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2013.401"},{"issue":"2","key":"1670_CR94","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1007\/s11263-014-0795-4","volume":"112","author":"B Zheng","year":"2015","unstructured":"Zheng, B., Zhao, Y., Yu, J., Ikeuchi, K., & Zhu, S. C. (2015). Scene understanding by reasoning stability and safety. International Journal of Computer Vision (IJCV), 112(2), 221\u2013238.","journal-title":"International Journal of Computer Vision (IJCV)"},{"issue":"4","key":"1670_CR95","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1561\/0600000018","volume":"2","author":"SC Zhu","year":"2007","unstructured":"Zhu, S. C., & Mumford, D. (2007). A stochastic grammar of images. Foundations and Trends in Computer Graphics and Vision, 2(4), 259\u2013362.","journal-title":"Foundations and Trends in Computer Graphics and Vision"},{"issue":"3","key":"1670_CR96","doi-asserted-by":"publisher","first-page":"310","DOI":"10.1016\/j.eng.2020.01.011","volume":"6","author":"Y Zhu","year":"2020","unstructured":"Zhu, Y., Gao, T., Fan, L., Huang, S., Edmonds, M., Liu, H., et al. (2020). Dark, beyond deep: A paradigm shift to cognitive ai with humanlike common sense. Engineering, 6(3), 310\u2013345.","journal-title":"Engineering"},{"key":"1670_CR97","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Jiang, C., Zhao, Y., Terzopoulos, D., & Zhu, S. C. (2016). Inferring forces and learning human utilities from videos. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.415"},{"key":"1670_CR98","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Zhao, Y., & Zhu, S. C. (2015). Understanding tools: Task-oriented object modeling, learning and recognition. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298903"},{"issue":"2","key":"1670_CR99","doi-asserted-by":"publisher","first-page":"143","DOI":"10.1007\/s11263-018-1133-z","volume":"127","author":"C Zou","year":"2019","unstructured":"Zou, C., Guo, R., Li, Z., & Hoiem, D. (2019). Complete 3d scene parsing from an rgbd image. International Journal of Computer Vision (IJCV), 127(2), 143\u2013162.","journal-title":"International Journal of Computer Vision (IJCV)"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01670-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-022-01670-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01670-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,19]],"date-time":"2023-02-19T17:29:04Z","timestamp":1676827744000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-022-01670-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,20]]},"references-count":99,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2022,12]]}},"alternative-id":["1670"],"URL":"https:\/\/doi.org\/10.1007\/s11263-022-01670-0","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,9,20]]},"assertion":[{"value":"19 February 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 August 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}