{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T13:27:38Z","timestamp":1773840458418,"version":"3.50.1"},"publisher-location":"Cham","reference-count":90,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726903","type":"print"},{"value":"9783031726910","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72691-0_18","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:04:54Z","timestamp":1730570694000},"page":"313-331","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis"],"prefix":"10.1007","author":[{"given":"Basile","family":"Van Hoorick","sequence":"first","affiliation":[]},{"given":"Rundi","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Ege","family":"Ozguroglu","sequence":"additional","affiliation":[]},{"given":"Kyle","family":"Sargent","sequence":"additional","affiliation":[]},{"given":"Ruoshi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Pavel","family":"Tokmakov","sequence":"additional","affiliation":[]},{"given":"Achal","family":"Dave","sequence":"additional","affiliation":[]},{"given":"Changxi","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Carl","family":"Vondrick","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"Bahmani, S., et al.: 4D-fy: text-to-4D generation using hybrid score distillation sampling. arXiv preprint arXiv:2311.17984 (2023)","DOI":"10.1109\/CVPR52733.2024.00764"},{"key":"18_CR2","doi-asserted-by":"crossref","unstructured":"Bansal, A., Vo, M., Sheikh, Y., Ramanan, D., Narasimhan, S.: 4D visualization of dynamic events from unconstrained multi-view videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5366\u20135375 (2020)","DOI":"10.1109\/CVPR42600.2020.00541"},{"key":"18_CR3","unstructured":"Bar-Tal, O., et\u00a0al.: Lumiere: a space-time diffusion model for video generation. arXiv preprint arXiv:2401.12945 (2024)"},{"issue":"6","key":"18_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417827","volume":"39","author":"M Bemana","year":"2020","unstructured":"Bemana, M., Myszkowski, K., Seidel, H.P., Ritschel, T.: X-fields: implicit neural view-, light-and time-image interpolation. ACM Trans. Graph. (TOG) 39(6), 1\u201315 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"18_CR5","unstructured":"Blattmann, A., et\u00a0al.: Stable video diffusion: scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)"},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"18_CR7","unstructured":"Brooks, T., et al.: Video generation models as world simulators (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Broxton, M., et al.: Immersive light field video with a layered mesh representation. ACM Trans. Graph. (TOG) 39(4), 86-1 (2020)","DOI":"10.1145\/3386569.3392485"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: COCO-Stuff: thing and stuff classes in context. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Cao, A., Johnson, J.: HexPlane: a fast representation for dynamic scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 130\u2013141 (2023)","DOI":"10.1109\/CVPR52729.2023.00021"},{"key":"18_CR11","doi-asserted-by":"crossref","unstructured":"Chen, R., Chen, Y., Jiao, N., Jia, K.: Fantasia3D: disentangling geometry and appearance for high-quality text-to-3D content creation. arXiv preprint arXiv:2303.13873 (2023)","DOI":"10.1109\/ICCV51070.2023.02033"},{"key":"18_CR12","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"18_CR13","doi-asserted-by":"crossref","unstructured":"Corona, K., Osterdahl, K., Collins, R., Hoogs, A.: MEVA: a large-scale multiview, multimodal video dataset for activity detection. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1060\u20131068 (2021)","DOI":"10.1109\/WACV48630.2021.00110"},{"key":"18_CR14","doi-asserted-by":"crossref","unstructured":"Du, Y., Zhang, Y., Yu, H.X., Tenenbaum, J.B., Wu, J.: Neural radiance flow for 4D view synthesis and video processing. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 14304\u201314314. IEEE Computer Society (2021)","DOI":"10.1109\/ICCV48922.2021.01406"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Ehsani, K., Mottaghi, R., Farhadi, A.: SeGAN: segmenting and generating the invisible. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00643"},{"key":"18_CR16","doi-asserted-by":"crossref","unstructured":"Gao, C., Saraf, A., Kopf, J., Huang, J.B.: Dynamic view synthesis from dynamic monocular video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5712\u20135721 (2021)","DOI":"10.1109\/ICCV48922.2021.00566"},{"key":"18_CR17","unstructured":"Gao, H., Li, R., Tulsiani, S., Russell, B., Kanazawa, A.: Monocular dynamic view synthesis: a reality check. In: Advances in Neural Information Processing Systems, vol. 35, pp. 33768\u201333780 (2022)"},{"key":"18_CR18","doi-asserted-by":"crossref","unstructured":"Ge, S., et al.: Preserve your own correlation: a noise prior for video diffusion models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"18_CR19","unstructured":"Grauman, K., et\u00a0al.: Ego4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"18_CR20","unstructured":"Grauman, K., et\u00a0al.: Ego-Exo4D: understanding skilled human activity from first-and third-person perspectives. arXiv preprint arXiv:2311.18259 (2023)"},{"key":"18_CR21","unstructured":"Greff, K., et\u00a0al.: Kubric: a scalable dataset generator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3749\u20133761 (2022)"},{"key":"18_CR22","doi-asserted-by":"crossref","unstructured":"Guizilini, V., Ambrus, R., Pillai, S., Raventos, A., Gaidon, A.: 3D packing for self-supervised monocular depth estimation. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00256"},{"key":"18_CR23","doi-asserted-by":"crossref","unstructured":"Haque, A., Tancik, M., Efros, A.A., Holynski, A., Kanazawa, A.: Instruct-NeRF2NeRF: editing 3D scenes with instructions. arXiv preprint arXiv:2303.12789 (2023)","DOI":"10.1109\/ICCV51070.2023.01808"},{"key":"18_CR24","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"18_CR25","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"18_CR26","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv abs\/2204.03458 (2022). https:\/\/api.semanticscholar.org\/CorpusID:248006185"},{"key":"18_CR27","doi-asserted-by":"crossref","unstructured":"H\u00f6llein, L., Cao, A., Owens, A., Johnson, J., Nie\u00dfner, M.: Text2Room: extracting textured 3D meshes from 2D text-to-image models. arXiv preprint arXiv:2303.11989 (2023)","DOI":"10.1109\/ICCV51070.2023.00727"},{"key":"18_CR28","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3D Gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42(4) (2023). https:\/\/repo-sam.inria.fr\/fungraph\/3d-gaussian-splatting\/","DOI":"10.1145\/3592433"},{"key":"18_CR29","doi-asserted-by":"crossref","unstructured":"Khirodkar, R., Bansal, A., Ma, L., Newcombe, R., Vo, M., Kitani, K.: Ego-humans: an ego-centric 3D multi-human benchmark. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19807\u201319819 (2023)","DOI":"10.1109\/ICCV51070.2023.01814"},{"key":"18_CR30","doi-asserted-by":"crossref","unstructured":"Li, T., et\u00a0al.: Neural 3D video synthesis from multi-view video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5521\u20135531 (2022)","DOI":"10.1109\/CVPR52688.2022.00544"},{"key":"18_CR31","doi-asserted-by":"crossref","unstructured":"Li, Z., Niklaus, S., Snavely, N., Wang, O.: Neural scene flow fields for space-time view synthesis of dynamic scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6498\u20136508 (2021)","DOI":"10.1109\/CVPR46437.2021.00643"},{"key":"18_CR32","doi-asserted-by":"crossref","unstructured":"Li, Z., Wang, Q., Cole, F., Tucker, R., Snavely, N.: DyniBaR: neural dynamic image-based rendering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4273\u20134284 (2023)","DOI":"10.1109\/CVPR52729.2023.00416"},{"key":"18_CR33","doi-asserted-by":"crossref","unstructured":"Lin, C.H., et al.: Magic3D: high-resolution text-to-3D content creation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 300\u2013309 (2023)","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"18_CR34","doi-asserted-by":"crossref","unstructured":"Ling, H., Kim, S.W., Torralba, A., Fidler, S., Kreis, K.: Align your Gaussians: text-to-4D with dynamic 3D Gaussians and composed diffusion models. arXiv preprint arXiv:2312.13763 (2023)","DOI":"10.1109\/CVPR52733.2024.00819"},{"key":"18_CR35","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3D object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9298\u20139309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"18_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y.L., et al.: Robust dynamic radiance fields. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13\u201323 (2023)","DOI":"10.1109\/CVPR52729.2023.00010"},{"key":"18_CR37","doi-asserted-by":"crossref","unstructured":"Long, X., et\u00a0al.: Wonder3D: single image to 3D using cross-domain diffusion. arXiv preprint arXiv:2310.15008 (2023)","DOI":"10.1109\/CVPR52733.2024.00951"},{"key":"18_CR38","doi-asserted-by":"crossref","unstructured":"Luiten, J., Kopanas, G., Leibe, B., Ramanan, D.: Dynamic 3D Gaussians: tracking by persistent dynamic view synthesis. arXiv preprint arXiv:2308.09713 (2023)","DOI":"10.1109\/3DV62453.2024.00044"},{"key":"18_CR39","doi-asserted-by":"crossref","unstructured":"Luo, M., Xue, Z., Dimakis, A., Grauman, K.: Put myself in your shoes: lifting the egocentric perspective from exocentric videos. arXiv preprint arXiv:2403.06351 (2024)","DOI":"10.1007\/978-3-031-72920-1_23"},{"key":"18_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1007\/978-3-030-58452-8_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Mildenhall","year":"2020","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 405\u2013421. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_24"},{"issue":"1","key":"18_CR41","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"18_CR42","doi-asserted-by":"crossref","unstructured":"Ozguroglu, E., et al.: pix2gestalt: amodal segmentation by synthesizing wholes. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00377"},{"key":"18_CR43","doi-asserted-by":"crossref","unstructured":"Park, K., et al.: NeRFies: deformable neural radiance fields. ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"18_CR44","doi-asserted-by":"crossref","unstructured":"Park, K., et al.: HyperNeRF: a higher-dimensional representation for topologically varying neural radiance fields. arXiv preprint arXiv:2106.13228 (2021)","DOI":"10.1145\/3478513.3480487"},{"key":"18_CR45","doi-asserted-by":"crossref","unstructured":"Po, R., Wetzstein, G.: Compositional 3D scene generation using locally conditioned diffusion. arXiv preprint arXiv:2303.12218 (2023)","DOI":"10.1109\/3DV62453.2024.00026"},{"key":"18_CR46","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: text-to-3D using 2D diffusion. arXiv preprint arXiv:2209.14988 (2022)"},{"key":"18_CR47","doi-asserted-by":"crossref","unstructured":"Pumarola, A., Corona, E., Pons-Moll, G., Moreno-Noguer, F.: D-NeRF: neural radiance fields for dynamic scenes. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"18_CR48","doi-asserted-by":"crossref","unstructured":"Pumarola, A., Corona, E., Pons-Moll, G., Moreno-Noguer, F.: D-NeRF: neural radiance fields for dynamic scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10318\u201310327 (2021)","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"18_CR49","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"18_CR50","doi-asserted-by":"crossref","unstructured":"Raistrick, A., et\u00a0al.: Infinite photorealistic worlds using procedural generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12630\u201312641 (2023)","DOI":"10.1109\/CVPR52729.2023.01215"},{"key":"18_CR51","doi-asserted-by":"crossref","unstructured":"Raistrick, A., et\u00a0al.: Infinigen indoors: photorealistic indoor scenes using procedural generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21783\u201321794 (2024)","DOI":"10.1109\/CVPR52733.2024.02058"},{"key":"18_CR52","doi-asserted-by":"crossref","unstructured":"Sargent, K., et\u00a0al.: ZeroNVS: zero-shot 360-degree view synthesis from a single real image. arXiv preprint arXiv:2310.17994 (2023)","DOI":"10.1109\/CVPR52733.2024.00900"},{"key":"18_CR53","unstructured":"Saxena, S., Kar, A., Norouzi, M., Fleet, D.J.: Monocular depth estimation using diffusion models. arXiv preprint arXiv:2302.14816 (2023)"},{"key":"18_CR54","doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21096\u201321106 (2022)","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"18_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-030-58517-4_3","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Shamsian","year":"2020","unstructured":"Shamsian, A., Kleinfeld, O., Globerson, A., Chechik, G.: Learning object permanence from video. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12361, pp. 35\u201350. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_3"},{"key":"18_CR56","unstructured":"Shi, Y., Wang, P., Ye, J., Mai, L., Li, K., Yang, X.: MVDream: multi-view diffusion for 3D generation. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"18_CR57","unstructured":"Singer, U., et\u00a0al.: Make-a-video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"18_CR58","unstructured":"Singer, U., et\u00a0al.: Text-to-4D dynamic scene generation. arXiv preprint arXiv:2301.11280 (2023)"},{"key":"18_CR59","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7262\u20137272 (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"18_CR60","unstructured":"Tokmakov, P., Jabri, A., Li, J., Gaidon, A.: Object permanence emerges in a random walk along memory. In: ICML (2022)"},{"key":"18_CR61","doi-asserted-by":"crossref","unstructured":"Tokmakov, P., Li, J., Burgard, W., Gaidon, A.: Learning to track with object permanence. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01068"},{"key":"18_CR62","doi-asserted-by":"crossref","unstructured":"Tretschk, E., Tewari, A., Golyanik, V., Zollh\u00f6fer, M., Lassner, C., Theobalt, C.: Non-rigid neural radiance fields: reconstruction and novel view synthesis of a dynamic scene from monocular video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12959\u201312970 (2021)","DOI":"10.1109\/ICCV48922.2021.01272"},{"key":"18_CR63","unstructured":"Tschernezki, V., et al.: Epic fields: marrying 3D geometry and video understanding. arXiv preprint arXiv:2306.08731 (2023)"},{"key":"18_CR64","doi-asserted-by":"crossref","unstructured":"Van\u00a0Hoorick, B., Tendulkar, P., Suris, D., Park, D., Stent, S., Vondrick, C.: Revealing occlusions with 4D neural fields. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3011\u20133021 (2022)","DOI":"10.1109\/CVPR52688.2022.00302"},{"key":"18_CR65","doi-asserted-by":"crossref","unstructured":"Van\u00a0Hoorick, B., Tokmakov, P., Stent, S., Li, J., Vondrick, C.: Tracking through containers and occluders in the wild. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01326"},{"key":"18_CR66","doi-asserted-by":"crossref","unstructured":"Voleti, V., et al.: SV3D: novel multi-view synthesis and 3D generation from a single image using latent video diffusion. arXiv preprint arXiv:2403.12008 (2024)","DOI":"10.1007\/978-3-031-73232-4_25"},{"key":"18_CR67","unstructured":"Walke, H., et al.: BridgeData V2: a dataset for robot learning at scale. In: Conference on Robot Learning (CoRL) (2023)"},{"key":"18_CR68","unstructured":"Wang, C., Eckart, B., Lucey, S., Gallo, O.: Neural trajectory fields for dynamic novel view synthesis. arXiv preprint arXiv:2105.05994 (2021)"},{"key":"18_CR69","unstructured":"Wang, C., et al.: Diffusion priors for dynamic view synthesis from monocular videos. arXiv preprint arXiv:2401.05583 (2024)"},{"key":"18_CR70","doi-asserted-by":"crossref","unstructured":"Wang, H., Du, X., Li, J., Yeh, R.A., Shakhnarovich, G.: Score Jacobian chaining: lifting pretrained 2D diffusion models for 3D generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12619\u201312629 (2023)","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"18_CR71","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Fourier plenoctrees for dynamic radiance field rendering in real-time. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13524\u201313534 (2022)","DOI":"10.1109\/CVPR52688.2022.01316"},{"key":"18_CR72","unstructured":"Wang, Z., et al.: ProlificDreamer: high-fidelity and diverse text-to-3D generation with variational score distillation. arXiv preprint arXiv:2305.16213 (2023)"},{"key":"18_CR73","unstructured":"Weissenborn, D., T\u00e4ckstr\u00f6m, O., Uszkoreit, J.: Scaling autoregressive video models. In: ICLR (2020)"},{"key":"18_CR74","unstructured":"Wikipedia contributors: camera dolly\u2014Wikipedia, the free encyclopedia (2024). https:\/\/en.wikipedia.org\/wiki\/Camera_dolly. Accessed 2024"},{"key":"18_CR75","doi-asserted-by":"crossref","unstructured":"Wu, G., et al.: 4D Gaussian splatting for real-time dynamic scene rendering. arXiv preprint arXiv:2310.08528 (2023)","DOI":"10.1109\/CVPR52733.2024.01920"},{"key":"18_CR76","unstructured":"Wu, R., et\u00a0al.: ReconFusion: 3D reconstruction with diffusion priors. arXiv preprint arXiv:2312.02981 (2023)"},{"key":"18_CR77","doi-asserted-by":"crossref","unstructured":"Xian, W., Huang, J.B., Kopf, J., Kim, C.: Space-time neural irradiance fields for free-viewpoint video. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00930"},{"key":"18_CR78","doi-asserted-by":"crossref","unstructured":"Xiao, L., Nouri, S., Hegland, J., Garcia, A.G., Lanman, D.: NeuralPassthrough: learned real-time view synthesis for VR. In: ACM SIGGRAPH 2022 Conference Proceedings, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3528233.3530701"},{"key":"18_CR79","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. In: Advances in Neural Information Processing Systems, vol. 34, pp. 12077\u201312090 (2021)"},{"key":"18_CR80","doi-asserted-by":"crossref","unstructured":"Xie, Y., et al.: Neural fields in visual computing and beyond. In: Computer Graphics Forum, vol.\u00a041, pp. 641\u2013676. Wiley Online Library (2022)","DOI":"10.1111\/cgf.14505"},{"key":"18_CR81","unstructured":"Yoon, J.S., Kim, K., Gallo, O., Park, H.S., Kautz, J.: Novel view synthesis of dynamic scenes with globally coherent depths from a monocular camera. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5336\u20135345 (2020)"},{"key":"18_CR82","doi-asserted-by":"crossref","unstructured":"Yu, F., et al.: BDD100K: a diverse driving dataset for heterogeneous multitask learning (2020)","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"18_CR83","unstructured":"Yu, L., et\u00a0al.: Language model beats diffusion\u2013tokenizer is key to visual generation. In: ICLR (2024)"},{"key":"18_CR84","doi-asserted-by":"crossref","unstructured":"Zhan, X., Pan, X., Dai, B., Liu, Z., Lin, D., Loy, C.C.: Self-supervised scene de-occlusion. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00384"},{"issue":"4","key":"18_CR85","first-page":"1","volume":"40","author":"J Zhang","year":"2021","unstructured":"Zhang, J., et al.: Editable free-viewpoint video using a layered neural representation. ACM Trans. Graph. (TOG) 40(4), 1\u201318 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"18_CR86","unstructured":"Zhang, Q., et al.: SceneWiz3D: towards text-guided 3D scene composition. arXiv preprint arXiv:2312.08885 (2023)"},{"key":"18_CR87","unstructured":"Zhao, X., Colburn, R.A., Ma, F., Bautista, M.\u00c1., Susskind, J.M., Schwing, A.: Pseudo-generalized dynamic view synthesis from a video. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"18_CR88","unstructured":"Zhao, Y., Yan, Z., Xie, E., Hong, L., Li, Z., Lee, G.H.: Animate124: animating one image to 4D dynamic scene. arXiv preprint arXiv:2311.14603 (2023)"},{"key":"18_CR89","doi-asserted-by":"crossref","unstructured":"Zheng, S., et\u00a0al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"18_CR90","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Harley, A.W., Shen, B., Wetzstein, G., Guibas, L.J.: PointOdyssey: a large-scale synthetic dataset for long-term point tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19855\u201319865 (2023)","DOI":"10.1109\/ICCV51070.2023.01818"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72691-0_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:09:03Z","timestamp":1730570943000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72691-0_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726903","9783031726910"],"references-count":90,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72691-0_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}