{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T13:07:29Z","timestamp":1770815249369,"version":"3.50.1"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012335","type":"print"},{"value":"9783030012342","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01234-2_12","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T16:13:11Z","timestamp":1538755991000},"page":"194-211","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":93,"title":["Holistic 3D Scene Parsing and Reconstruction from a Single RGB Image"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1524-7148","authenticated-orcid":false,"given":"Siyuan","family":"Huang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4070-733X","authenticated-orcid":false,"given":"Siyuan","family":"Qi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7024-1545","authenticated-orcid":false,"given":"Yixin","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Yinxue","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Yuanlu","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Song-Chun","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"12_CR1","series-title":"Studies in Computational Intelligence","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-28661-2_2","volume-title":"Machine Learning for Computer Vision","author":"S Soatto","year":"2013","unstructured":"Soatto, S.: Actionable information in vision. In: Cipolla, R., Battiato, S., Farinella, G. (eds.) Machine Learning for Computer Vision. SCI, vol. 411. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-28661-2_2"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Qi, S., Zhu, Y., Huang, S., Jiang, C., Zhu, S.C.: Human-centric indoor scene synthesis using stochastic grammar. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00618"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Koppula, H., Saxena, A.: Hallucinated humans as the hidden context for labeling 3D scenes. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.385"},{"key":"12_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"482","DOI":"10.1007\/978-3-642-15561-1_35","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Gupta","year":"2010","unstructured":"Gupta, A., Efros, A.A., Hebert, M.: Blocks world revisited: image understanding using qualitative geometry and mechanics. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6314, pp. 482\u2013496. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15561-1_35"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhao, Y., Zhu, S.C.: Single-view 3D scene parsing by attributed grammar. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.93"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Zheng, B., Zhao, Y., Joey, C.Y., Ikeuchi, K., Zhu, S.C.: Detecting potential falling objects by inferring human action and natural disturbance. In: IEEE International Conference on Robotics and Automation (ICRA) (2014)","DOI":"10.1109\/ICRA.2014.6907351"},{"key":"12_CR7","doi-asserted-by":"publisher","first-page":"301","DOI":"10.1016\/j.tics.2006.05.002","volume":"10","author":"A Yuille","year":"2006","unstructured":"Yuille, A., Kersten, D.: Vision as Bayesian inference: analysis by synthesis? Trends Cogn. Sci. 10, 301\u2013308 (2006)","journal-title":"Trends Cogn. Sci."},{"key":"12_CR8","volume-title":"Lectures in Pattern Theory I, II and III: Pattern Analysis, Pattern Synthesis and Regular Structures","author":"U Grenander","year":"1976","unstructured":"Grenander, U.: Lectures in Pattern Theory I, II and III: Pattern Analysis, Pattern Synthesis and Regular Structures. Springer, New York (1976)"},{"key":"12_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/978-3-319-10584-0_11","volume-title":"Computer Vision \u2013 ECCV 2014","author":"MM Loper","year":"2014","unstructured":"Loper, M.M., Black, M.J.: OpenDR: an approximate differentiable renderer. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 154\u2013169. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10584-0_11"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Dai, J., He, K., Sun, J.: BoxSup: exploiting bounding boxes to supervise convolutional networks for semantic segmentation. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.191"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Conditional random fields as recurrent neural networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.179"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Noh, H., Hong, S., Han, B.: Learning deconvolution network for semantic segmentation. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.178"},{"key":"12_CR13","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) 40, 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI)"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Lin, G., Milan, A., Shen, C., Reid, I.: RefineNet: multi-path refinement networks for high-resolution semantic segmentation. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.549"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.660"},{"key":"12_CR17","unstructured":"Zhao, Y., Zhu, S.C.: Image parsing with stochastic scene grammar. In: Conference on Neural Information Processing Systems (NIPS) (2011)"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Zhu, S.C.: Scene parsing by integrating function, geometry and appearance models. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.401"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Choi, W., Chao, Y.W., Pantofaru, C., Savarese, S.: Understanding indoor scenes using 3D geometric phrases. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.12"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Lin, D., Fidler, S., Urtasun, R.: Holistic scene understanding for 3D object detection with RGBD cameras. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.179"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Guo, R., Hoiem, D.: Support surface prediction in indoor scenes. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.266"},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"668","DOI":"10.1007\/978-3-319-10599-4_43","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Y Zhang","year":"2014","unstructured":"Zhang, Y., Song, S., Tan, P., Xiao, J.: PanoContext: a whole-room 3D context model for panoramic scene understanding. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 668\u2013686. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10599-4_43"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Physically-based rendering for indoor scene understanding using convolutional neural networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.537"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Zou, C., Li, Z., Hoiem, D.: Complete 3D scene parsing from single RGBD image. arXiv preprint arXiv:1710.09490 (2017)","DOI":"10.1007\/s11263-018-1133-z"},{"key":"12_CR25","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1145\/1073204.1073232","volume":"24","author":"D Hoiem","year":"2005","unstructured":"Hoiem, D., Efros, A.A., Hebert, M.: Automatic photo pop-up. ACM Trans. Graph. (TOG) 24, 577\u2013584 (2005)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"12_CR26","unstructured":"Han, F., Zhu, S.C.: Bottom-up\/top-down image parsing by attribute graph grammar. In: ICCV (2005)"},{"key":"12_CR27","unstructured":"Saxena, A., Chung, S.H., Ng, A.Y.: Learning depth from single monocular images. In: Conference on Neural Information Processing Systems (NIPS) (2006)"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Hedau, V., Hoiem, D., Forsyth, D.: Recovering the spatial layout of cluttered rooms. In: CVPR (2009)","DOI":"10.1109\/ICCV.2009.5459411"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Lee, D.C., Hebert, M., Kanade, T.: Geometric reasoning for single image structure recovery. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206872"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Mallya, A., Lazebnik, S.: Learning informative edge maps for indoor scene layout prediction. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.113"},{"key":"12_CR31","doi-asserted-by":"crossref","unstructured":"Dasgupta, S., Fang, K., Chen, K., Savarese, S.: Delay: robust spatial layout estimation for cluttered indoor scenes. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.73"},{"key":"12_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1007\/978-3-319-54193-8_3","volume-title":"Computer Vision \u2013 ACCV 2016","author":"Y Ren","year":"2017","unstructured":"Ren, Y., Li, S., Chen, C., Kuo, C.-C.J.: A coarse-to-fine indoor layout estimation (CFILE) method. In: Lai, S.-H., Lepetit, V., Nishino, K., Sato, Y. (eds.) ACCV 2016. LNCS, vol. 10115, pp. 36\u201351. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54193-8_3"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Izadinia, H., Shan, Q., Seitz, S.M.: IM2CAD. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.260"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Lee, C.Y., Badrinarayanan, V., Malisiewicz, T., Rabinovich, A.: RoomNet: end-to-end room layout estimation. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.521"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Zhao, H., Lu, M., Yao, A., Guo, Y., Chen, Y., Zhang, L.: Physics inspired optimization on semantic transfer features: an alternative method for room layout estimation. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.99"},{"key":"12_CR36","doi-asserted-by":"crossref","unstructured":"Salas-Moreno, R.F., Newcombe, R.A., Strasdat, H., Kelly, P.H., Davison, A.J.: SLAM++: simultaneous localisation and mapping at the level of objects. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.178"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Aubry, M., Maturana, D., Efros, A.A., Russell, B.C., Sivic, J.: Seeing 3D chairs: exemplar part-based 2D\u20133D alignment using a large dataset of cad models. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.487"},{"key":"12_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"478","DOI":"10.1007\/978-3-319-10599-4_31","volume-title":"Computer Vision \u2013 ECCV 2014","author":"JJ Lim","year":"2014","unstructured":"Lim, J.J., Khosla, A., Torralba, A.: FPM: fine pose parts-based model with 3D CAD models. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 478\u2013493. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10599-4_31"},{"key":"12_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1007\/978-3-319-10599-4_41","volume-title":"Computer Vision \u2013 ECCV 2014","author":"S Song","year":"2014","unstructured":"Song, S., Xiao, J.: Sliding shapes for 3D object detection in depth images. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 634\u2013651. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10599-4_41"},{"key":"12_CR40","doi-asserted-by":"crossref","unstructured":"Tulsiani, S., Malik, J.: Viewpoints and keypoints. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298758"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Bansal, A., Russell, B., Gupta, A.: Marr revisited: 2D\u20133D alignment via surface normal prediction. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.642"},{"key":"12_CR42","doi-asserted-by":"crossref","unstructured":"Song, S., Xiao, J.: Deep sliding shapes for Amodal 3D object detection in RGB-D images. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.94"},{"key":"12_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"365","DOI":"10.1007\/978-3-319-46466-4_22","volume-title":"Computer Vision \u2013 ECCV 2016","author":"W J","year":"2016","unstructured":"J, W., et al.: Single image 3D interpreter network. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 365\u2013382. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_22"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Deng, Z., Latecki, L.J.: Amodal detection of 3D objects: inferring 3D bounding boxes from 2D ones in RGB-depth images. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.50"},{"key":"12_CR45","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S.P., Xiao, J.: Sun RGB-D: a RGB-D scene understanding benchmark suite. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"12_CR46","doi-asserted-by":"crossref","unstructured":"Song, S., Yu, F., Zeng, A., Chang, A.X., Savva, M., Funkhouser, T.: Semantic scene completion from a single depth image. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.28"},{"key":"12_CR47","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Saxena, A.: Modeling high-dimensional humans for activity anticipation using Gaussian process latent CRFs. In: Robotics: Science and Systems (RSS) (2014)","DOI":"10.15607\/RSS.2014.X.015"},{"key":"12_CR48","unstructured":"Xie, D., Todorovic, S., Zhu, S.C.: Inferring \u201cdark matter\u201d and \u201cdark energy\u201d from videos. In: ICCV (2013)"},{"key":"12_CR49","unstructured":"Wu, J., Wang, Y., Xue, T., Sun, X., Freeman, W.T., Tenenbaum, J.B.: MarrNet: 3D shape reconstruction via 2.5D sketches. In: Conference on Neural Information Processing Systems (NIPS) (2017)"},{"key":"12_CR50","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H., Wei, Y.: Deformable convolutional networks. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.89"},{"key":"12_CR51","doi-asserted-by":"crossref","unstructured":"Bodla, N., Singh, B., Chellappa, R., Davis, L.S.: Soft-NMS - improving object detection with one line of code. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.593"},{"key":"12_CR52","unstructured":"Chang, A.X., et al.: ShapeNet: an information-rich 3D model repository. arXiv preprint arXiv:1512.03012 (2015)"},{"key":"12_CR53","doi-asserted-by":"crossref","unstructured":"Savva, M., Chang, A.X., Hanrahan, P.: Semantically-enriched 3D models for common-sense knowledge. In: CVPR Workshop (2015)","DOI":"10.1109\/CVPRW.2015.7301289"},{"key":"12_CR54","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1093\/biomet\/57.1.97","volume":"57","author":"WK Hastings","year":"1970","unstructured":"Hastings, W.K.: Monte Carlo sampling methods using Markov chains and their applications. Biometrika 57, 97\u2013109 (1970)","journal-title":"Biometrika"},{"key":"12_CR55","unstructured":"Zhang, Y., Yu, F., Song, S., Xu, P., Seff, A., Xiao, J.: Large-scale scene understanding challenge: room layout estimation. In: CVPR Workshop (2015)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01234-2_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T00:21:16Z","timestamp":1664929276000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01234-2_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012335","9783030012342"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01234-2_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}