{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T17:07:02Z","timestamp":1780765622253,"version":"3.54.1"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031728891","type":"print"},{"value":"9783031728907","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72890-7_5","type":"book-chapter","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:02:38Z","timestamp":1733515358000},"page":"71-87","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Pyramid Diffusion for\u00a0Fine 3D Large Scene Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-3690-5659","authenticated-orcid":false,"given":"Yuheng","family":"Liu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9209-2154","authenticated-orcid":false,"given":"Xinke","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2556-8667","authenticated-orcid":false,"given":"Xueting","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2684-0062","authenticated-orcid":false,"given":"Lu","family":"Qi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7595-0997","authenticated-orcid":false,"given":"Chongshou","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4848-2304","authenticated-orcid":false,"given":"Ming-Hsuan","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,12,7]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Anvekar, T., Tabib, R.A., Hegde, D., Mudengudi, U.: Vg-vae: a venatus geometry point-cloud variational auto-encoder. In: CVPR (2022)","DOI":"10.1109\/CVPRW56347.2022.00336"},{"key":"5_CR2","unstructured":"Austin, J., Johnson, D.D., Ho, J., Tarlow, D., Van Den\u00a0Berg, R.: Structured denoising diffusion models in discrete state-spaces. In: NeurIPS (2021)"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Behley, J., et al.: Semantickitti: a dataset for semantic scene understanding of lidar sequences. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00939"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wang, G., Liu, Z.: Scenedreamer: unbounded 3d scene generation from 2d image collections. arXiv preprint arXiv: 2302.01330 (2023)","DOI":"10.1109\/TPAMI.2023.3321857"},{"key":"5_CR5","unstructured":"Cheng, A.C., Li, X., Liu, S., Sun, M., Yang, M.H.: Learning 3d dense correspondence via canonical point autoencoder. In: ECCV (2022)"},{"key":"5_CR6","unstructured":"Cheng, A.C., Li, X., Sun, M., Yang, M.H., Liu, S.: Learning 3d dense correspondence via canonical point autoencoder. In: NeurIPS (2021)"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"\u00c7i\u00e7ek, \u00d6., Abdulkadir, A., Lienkamp, S.S., Brox, T., Ronneberger, O.: 3d u-net: learning dense volumetric segmentation from sparse annotation. In: MICCAI (2016)","DOI":"10.1007\/978-3-319-46723-8_49"},{"issue":"3","key":"5_CR8","doi-asserted-by":"publisher","first-page":"1682","DOI":"10.1109\/TCYB.2021.3108165","volume":"53","author":"Y Cong","year":"2021","unstructured":"Cong, Y., Chen, R., Ma, B., Liu, H., Hou, D., Yang, C.: A comprehensive study of 3-d vision-based robot manipulation. IEEE Trans. Cybern. 53(3), 1682\u20131698 (2021)","journal-title":"IEEE Trans. Cybern."},{"key":"5_CR9","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS (2021)"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The kitti vision benchmark suite. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Graham, B., Engelcke, M., Van Der\u00a0Maaten, L.: 3d semantic segmentation with submanifold sparse convolutional networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00961"},{"key":"5_CR12","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: NeurIPS (2017)"},{"key":"5_CR13","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"5_CR14","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"issue":"47","key":"5_CR15","first-page":"1","volume":"23","author":"J Ho","year":"2022","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. J. Mach. Learn. Res. 23(47), 1\u201333 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"5_CR16","unstructured":"Huang, W., Wang, C., Zhang, R., Li, Y., Wu, J., Fei-Fei, L.: Voxposer: composable 3d value maps for robotic manipulation with language models. arXiv preprint arXiv:2307.05973 (2023)"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Lan, Z., Yew, Z.J., Lee, G.H.: Robust point cloud based reconstruction of large-scale outdoor scenes. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00992"},{"key":"5_CR18","unstructured":"Lee, J., Im, W., Lee, S., Yoon, S.E.: Diffusion probabilistic models for scene-scale 3d categorical data. arXiv preprint arXiv:2301.00527 (2023)"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Campus3d: a photogrammetry point cloud benchmark for hierarchical understanding of outdoor scene. In: ACM MM (2020)","DOI":"10.1145\/3394171.3413661"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., Ma, L., Zhong, Z., Liu, F., Chapman, M.A., Cao, D., Li, J.: Deep learning for lidar point clouds in autonomous driving: a review. In: NeurIPS (2020)","DOI":"10.1109\/TNNLS.2020.3015992"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., et\u00a0al.: Deepfusion: lidar-camera deep fusion for multi-modal 3d object detection. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, Q., Snavely, N., Kanazawa, A.: Infinitenature-zero: learning perpetual view generation of natural scenes from single images. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19769-7_30"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Lin, C.H., et al.: Infinicity: infinite-scale city synthesis. arXiv preprint arXiv:2301.09637 (2023)","DOI":"10.1109\/ICCV51070.2023.02085"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Liu, M., et al.: One-2-3-45++: fast single image to 3d objects with consistent multi-view generation and 3d diffusion. arXiv preprint arXiv:2311.07885 (2023)","DOI":"10.1109\/CVPR52733.2024.00960"},{"key":"5_CR25","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Luo, S., Hu, W.: Diffusion probabilistic models for 3d point cloud generation. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00286"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Ma, Q., Yang, J., Tang, S., Black, M.J.: The power of points for modeling humans in clothing. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01079"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Mascaro, R., Teixeira, L., Chli, M.: Diffuser: multi-view 2d-to-3d label diffusion for semantic scene segmentation. In: ICRA (2021)","DOI":"10.1109\/ICRA48506.2021.9561801"},{"key":"5_CR29","unstructured":"Moon, T., Choi, M., Lee, G., Ha, J.W., Lee, J.: Fine-tuning diffusion models with limited data. In: NeurIPS 2022 Workshop on Score-Based Methods (2022)"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Moro, S., Komuro, T.: Generation of virtual reality environment based on 3d scanned indoor physical space. In: ISVC (2021)","DOI":"10.1007\/978-3-030-90439-5_39"},{"key":"5_CR31","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: ICML (2021)"},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"\u00d6g\u00fcn, M.N., Kurul, R., Ya\u015far, M.F., Turkoglu, S.A., Avci, \u015e., Yildiz, N.: Effect of leap motion-based 3d immersive virtual reality usage on upper extremity function in ischemic stroke patients. Arquivos de neuro-psiquiatria (2019)","DOI":"10.1590\/0004-282x20190129"},{"key":"5_CR33","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: Dreamfusion: text-to-3d using 2d diffusion. arXiv (2022)"},{"key":"5_CR34","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: Pointnet++: deep hierarchical feature learning on point sets in a metric space. In: NeurIPS (2017)"},{"key":"5_CR35","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"5_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"5_CR37","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS (2022)"},{"key":"5_CR38","unstructured":"Sohl-Dickstein, J., Weiss, E.A., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. arXiv preprint arXiv:1503.03585 (2015)"},{"key":"5_CR39","doi-asserted-by":"crossref","unstructured":"Sra, M., Garrido-Jurado, S., Schmandt, C., Maes, P.: Procedurally generated virtual reality from 3d reconstructed physical space. In: Proceedings of the 22nd ACM Conference on Virtual Reality Software and Technology (2016)","DOI":"10.1145\/2993369.2993372"},{"key":"5_CR40","doi-asserted-by":"crossref","unstructured":"Su, S.Y., Bagautdinov, T., Rhodin, H.: NPC: neural point characters from video. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01359"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Tang, J., Nie, Y., Markhasin, L., Dai, A., Thies, J., Nie\u00dfner, M.: Diffuscene: scene graph denoising diffusion probabilistic model for generative indoor scene synthesis. arXiv preprint arXiv:2303.14207 (2023)","DOI":"10.1109\/CVPR52733.2024.01938"},{"key":"5_CR42","doi-asserted-by":"publisher","first-page":"126587","DOI":"10.1016\/j.neucom.2023.126587","volume":"553","author":"Y Tang","year":"2023","unstructured":"Tang, Y., He, H., Wang, Y., Mao, Z., Wang, H.: Multi-modality 3d object detection in autonomous driving: a review. Neurocomputing 553, 126587 (2023)","journal-title":"Neurocomputing"},{"issue":"3","key":"5_CR43","doi-asserted-by":"publisher","first-page":"8439","DOI":"10.1109\/LRA.2022.3188435","volume":"7","author":"J Wilson","year":"2022","unstructured":"Wilson, J., et al.: MotionSC: data set and network for real-time semantic mapping in dynamic environments. IEEE Robot. Autom. Lett. 7(3), 8439\u20138446 (2022)","journal-title":"IEEE Robot. Autom. Lett."},{"key":"5_CR44","unstructured":"Wu, P., Jia, X., Chen, L., Yan, J., Li, H., Qiao, Y.: Trajectory-guided control prediction for end-to-end autonomous driving: a simple yet strong baseline. In: NeurIPS (2022)"},{"key":"5_CR45","doi-asserted-by":"crossref","unstructured":"Xie, H., Chen, Z., Hong, F., Liu, Z.: Citydreamer: compositional generative model of unbounded 3d cities. arXiv preprint arXiv:2309.00610 (2023)","DOI":"10.1109\/CVPR52733.2024.00923"},{"key":"5_CR46","unstructured":"Xu, Z., He, Z., Wu, J., Song, S.: Learning 3d dynamic scene representations for robot manipulation. arXiv preprint arXiv:2011.01968 (2020)"},{"key":"5_CR47","unstructured":"Zeng, X., et al.: Lion: latent point diffusion models for 3d shape generation. In: NeurIPS (2022)"},{"key":"5_CR48","unstructured":"Zheng, H., Nie, W., Vahdat, A., Anandkumar, A.: Fast training of diffusion models with masked transformers. arXiv preprint arXiv:2306.09305 (2023)"},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Yifan, W., Wetzstein, G., Black, M.J., Hilliges, O.: Pointavatar: deformable point-based head avatars from videos. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02017"},{"key":"5_CR50","doi-asserted-by":"crossref","unstructured":"Zhou, L., Du, Y., Wu, J.: 3d shape generation and completion through point-voxel diffusion. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00577"},{"key":"5_CR51","unstructured":"Zhuang, F., et al.: A comprehensive survey on transfer learning. arXiv preprint arXiv:1911.02685 (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72890-7_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:03:40Z","timestamp":1733515420000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72890-7_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,7]]},"ISBN":["9783031728891","9783031728907"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72890-7_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}