{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T14:51:53Z","timestamp":1778079113890,"version":"3.51.4"},"reference-count":90,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100003472","name":"Harbin Institute of Technology","doi-asserted-by":"publisher","award":["ZDXMPY20180109"],"award-info":[{"award-number":["ZDXMPY20180109"]}],"id":[{"id":"10.13039\/501100003472","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003472","name":"Harbin Institute of Technology","doi-asserted-by":"publisher","award":["ZDXMPY20180109"],"award-info":[{"award-number":["ZDXMPY20180109"]}],"id":[{"id":"10.13039\/501100003472","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003472","name":"Harbin Institute of Technology","doi-asserted-by":"publisher","award":["ZDXMPY20180109"],"award-info":[{"award-number":["ZDXMPY20180109"]}],"id":[{"id":"10.13039\/501100003472","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003472","name":"Harbin Institute of Technology","doi-asserted-by":"publisher","award":["ZDXMPY20180109"],"award-info":[{"award-number":["ZDXMPY20180109"]}],"id":[{"id":"10.13039\/501100003472","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003472","name":"Harbin Institute of Technology","doi-asserted-by":"publisher","award":["ZDXMPY20180109"],"award-info":[{"award-number":["ZDXMPY20180109"]}],"id":[{"id":"10.13039\/501100003472","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01927-x","type":"journal-article","created":{"date-parts":[[2025,8,19]],"date-time":"2025-08-19T11:06:14Z","timestamp":1755601574000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Vehiclesim: realistic and 3D-aware video editing with one image for autonomous driving"],"prefix":"10.1007","volume":"31","author":[{"given":"Beike","family":"Yu","sequence":"first","affiliation":[]},{"given":"Dafang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jiang","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Pengyu","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Yifei","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,19]]},"reference":[{"issue":"28","key":"1927_CR1","doi-asserted-by":"publisher","first-page":"eaaw0863","DOI":"10.1126\/scirobotics.aaw0863","volume":"4","author":"W Li","year":"2019","unstructured":"Li, W., Pan, C., Zhang, R., Ren, J., Ma, Y., Fang, J., Yan, F., Geng, Q., Huang, X., Gong, H.: Aads: Augmented autonomous driving simulation using data-driven algorithms. Sci. Robot. 4(28), eaaw0863 (2019)","journal-title":"Sci. 
Robot."},{"key":"1927_CR2","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: Text-to-3D using 2D Diffusion. presented at Int. Conf. Learn. Represent., Kigali, Rwanda. (2023)"},{"key":"1927_CR3","doi-asserted-by":"crossref","unstructured":"Mercier, A., Nakhli, R., Reddy, M., Yasarla, R., Cai, H., Porikli, F., Berger, G.: Hexagen3d: Stablediffusion is just one step away from fast and diverse text-to-3d generation. arXiv preprints, arXiv2401.07727 (2024)","DOI":"10.1109\/WACV61041.2025.00129"},{"key":"1927_CR4","doi-asserted-by":"crossref","unstructured":"Lin, C.-H., Gao, J., Tang, L., Takikawa, T., Zeng, X., Huang, X., Kreis, K., Fidler, S., Liu, M.-Y., Lin, T.-Y.: Magic3d: High-resolution text-to-3d content creation. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 300-309 (2023)","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"1927_CR5","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 10684-10695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1927_CR6","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T.: Photorealistic text-to-image diffusion models with deep language understanding. Proc. Adv. Neural Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR7","unstructured":"Liu, Y., Lin, C., Zeng, Z., Long, X., Liu, L., Komura, T., Wang, W.: SyncDreamer: Generating Multiview-consistent Images from a Single-view Image. presented at Int. Conf. Learn. Represent., Vienna, Austria. (2024)"},{"key":"1927_CR8","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: Zero-shot one image to 3d object. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis., 9298-9309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"issue":"1","key":"1927_CR9","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: Representing scenes as neural radiance fields for view synthesis. Commun. Acm 65(1), 99\u2013106 (2021)","journal-title":"Commun. Acm"},{"key":"1927_CR10","doi-asserted-by":"crossref","unstructured":"Ost, J., Mannan, F., Thuerey, N., Knodt, J., Heide, F.: Neural scene graphs for dynamic scenes. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 2856-2865 (2021)","DOI":"10.1109\/CVPR46437.2021.00288"},{"key":"1927_CR11","unstructured":"Song, Y., Kong, C., Lee, S., Kwak, N., Lee, J.: Towards efficient neural scene graphs by learning consistency fields. arXiv preprints, arXiv2210.04127 (2022)"},{"key":"1927_CR12","doi-asserted-by":"crossref","unstructured":"Tancik, M., Casser, V., Yan, X., Pradhan, S., Mildenhall, B., Srinivasan, P.P., Barron, J.T., Kretzschmar, H.: Block-nerf: Scalable large scene neural view synthesis. In: Proc. IEEE\/CVF Conf. Comput. Vis. 
Pattern Recognit., 8248-8258 (2022)","DOI":"10.1109\/CVPR52688.2022.00807"},{"key":"1927_CR13","doi-asserted-by":"crossref","unstructured":"Turki, H., Ramanan, D., Satyanarayanan, M.: Mega-nerf: Scalable construction of large-scale nerfs for virtual fly-throughs. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 12922-12931 (2022)","DOI":"10.1109\/CVPR52688.2022.01258"},{"key":"1927_CR14","doi-asserted-by":"crossref","unstructured":"Wang, Z., Shen, T., Gao, J., Huang, S., Munkberg, J., Hasselgren, J., Gojcic, Z., Chen, W., Fidler, S.: Neural fields meet explicit geometric representations for inverse rendering of urban scenes. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 8370-8380 (2023)","DOI":"10.1109\/CVPR52729.2023.00809"},{"key":"1927_CR15","unstructured":"Mi, Z., Xu, D.: Switch-nerf: Learning scene decomposition with mixture of experts for large-scale neural radiance fields. presented at Int. Conf. Learn. Represent. (2022)"},{"issue":"4","key":"1927_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592433","volume":"42","author":"B Kerbl","year":"2023","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3d gaussian splatting for real-time radiance field rendering. Acm Trans. Graph. 42(4), 1\u201314 (2023)","journal-title":"Acm Trans. Graph."},{"key":"1927_CR17","doi-asserted-by":"crossref","unstructured":"Luiten, J., Kopanas, G., Leibe, B., Ramanan, D.: Dynamic 3d gaussians: Tracking by persistent dynamic view synthesis. In: Proc. Int. Conf. 3D Vis., 800-809 (2024)","DOI":"10.1109\/3DV62453.2024.00044"},{"key":"1927_CR18","doi-asserted-by":"crossref","unstructured":"Wu, G., Yi, T., Fang, J., Xie, L., Zhang, X., Wei, W., Liu, W., Tian, Q., Wang, X.: 4d gaussian splatting for real-time dynamic scene rendering. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 20310-20320 (2024)","DOI":"10.1109\/CVPR52733.2024.01920"},{"key":"1927_CR19","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gao, X., Zhou, W., Jiao, S., Zhang, Y., Jin, X.: Deformable 3d gaussians for high-fidelity monocular dynamic scene reconstruction. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 20331-20341 (2024)","DOI":"10.1109\/CVPR52733.2024.01922"},{"key":"1927_CR20","unstructured":"Blattmann, A., Dockhorn, T., Kulal, S., Mendelevitch, D., Kilian, M., Lorenz, D., Levi, Y., English, Z., Voleti, V., Letts, A.: Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprints, arXiv2311.15127 (2023)"},{"key":"1927_CR21","doi-asserted-by":"crossref","unstructured":"Chen, X., Huang, L., Liu, Y., Shen, Y., Zhao, D., Zhao, H.: Anydoor: Zero-shot object-level image customization. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 6593-6602 (2024)","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"1927_CR22","doi-asserted-by":"crossref","unstructured":"Yang, B., Gu, S., Zhang, B., Zhang, T., Chen, X., Sun, X., Chen, D., Wen, F.: Paint by example: Exemplar-based image editing with diffusion models. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 18381-18391 (2023)","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"1927_CR23","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Aberman, K., Fried, O., Cohen-Or, D., Lischinski, D.: Break-a-scene: Extracting multiple concepts from a single image. In: Proc. Conf. Comput. Graph. Interact. Techn. 
Asia, 1-12 (2023)","DOI":"10.1145\/3610548.3618154"},{"key":"1927_CR24","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J.: Learning transferable visual models from natural language supervision. In: Proc. Int. Conf. Mach. Learn., 8748-8763 (2021)"},{"key":"1927_CR25","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A.: Dinov2: Learning robust visual features without supervision. Transact. Mach. Learn. Res., 1-31 (2024)"},{"key":"1927_CR26","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M.: An image is worth 16x16 words: Transformers for image recognition at scale. presented at Int. Conf. Learn. Represent. (2021)"},{"key":"1927_CR27","unstructured":"Gu, A., Dao, T.: Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprints, arXiv2312.00752 (2023)"},{"key":"1927_CR28","first-page":"35799","volume":"36","author":"M Deitke","year":"2024","unstructured":"Deitke, M., Liu, R., Wallingford, M., Ngo, H., Michel, O., Kusupati, A., Fan, A., Laforte, C., Voleti, V., Gadre, S.Y.: Objaverse-xl: A universe of 10m+ 3d objects. Proc. Adv. Neural Inf. Process. Syst. 36, 35799\u2013813 (2024)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR29","unstructured":"Shi, R., Chen, H., Zhang, Z., Liu, M., Xu, C., Wei, X., Chen, L., Zeng, C., Su, H.: Zero123++: a single image to consistent multi-view diffusion base model. arXiv preprints, arXiv2310.15110 (2023)"},{"key":"1927_CR30","unstructured":"Shi, Y., Wang, P., Ye, J., Mai, L., Li, K., Yang, X.: MVDream: Multi-view Diffusion for 3D Generation. presented at Int. Conf. Learn. Represent. Vienna, Austria. (2024)"},{"key":"1927_CR31","doi-asserted-by":"crossref","unstructured":"Ye, J., Wang, P., Li, K., Shi, Y., Wang, H.: Consistent-1-to-3: Consistent image to 3d view synthesis via geometry-aware diffusion models. In: Proc. Int. Conf. 3D Vis., 664-674 (2024)","DOI":"10.1109\/3DV62453.2024.00027"},{"key":"1927_CR32","doi-asserted-by":"crossref","unstructured":"Tseng, H.-Y., Li, Q., Kim, C., Alsisan, S., Huang, J.-B., Kopf, J.: Consistent view synthesis with pose-guided diffusion models. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 16773-16783 (2023)","DOI":"10.1109\/CVPR52729.2023.01609"},{"key":"1927_CR33","doi-asserted-by":"crossref","unstructured":"Liu, M., Shi, R., Chen, L., Zhang, Z., Xu, C., Wei, X., Chen, H., Zeng, C., Gu, J., Su, H.: One-2-3-45++: Fast single image to 3d objects with consistent multi-view generation and 3d diffusion. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 10072-10083 (2024)","DOI":"10.1109\/CVPR52733.2024.00960"},{"key":"1927_CR34","first-page":"22226","volume":"36","author":"M Liu","year":"2024","unstructured":"Liu, M., Xu, C., Jin, H., Chen, L., Varma, T., Xu, Z., Su, H.: One-2-3-45: Any single image to 3d mesh in 45 seconds without per-shape optimization. Proc. Adv. Neural Inf. Process. Syst. 36, 22226\u201346 (2024)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR35","doi-asserted-by":"crossref","unstructured":"Yang, J., Cheng, Z., Duan, Y., Ji, P., Li, H.: Consistnet: Enforcing 3d consistency for multi-view images diffusion. In: Proc. IEEE\/CVF Conf. Comput. Vis. 
Pattern Recognit., 7079-7088 (2024)","DOI":"10.1109\/CVPR52733.2024.00676"},{"key":"1927_CR36","doi-asserted-by":"crossref","unstructured":"Kwak, J.-g., Dong, E., Jin, Y., Ko, H., Mahajan, S., Yi, K.M.: Vivid-1-to-3: Novel view synthesis with video diffusion models. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 6775-6785 (2024)","DOI":"10.1109\/CVPR52733.2024.00647"},{"key":"1927_CR37","unstructured":"Melas-Kyriazi, L., Laina, I., Rupprecht, C., Neverova, N., Vedaldi, A., Gafni, O., Kokkinos, F.: IM-3D: Iterative Multiview Diffusion and Reconstruction for High-Quality 3D Generation. presented at Int. Conf. Mach. Learn., Vienna, Austria. (2024)"},{"key":"1927_CR38","doi-asserted-by":"crossref","unstructured":"Voleti, V., Yao, C.-H., Boss, M., Letts, A., Pankratz, D., Tochilkin, D., Laforte, C., Rombach, R., Jampani, V.: Sv3d: Novel multi-view synthesis and 3d generation from a single image using latent video diffusion. In: Proc. Eur. Conf. Comput. Vis., 439-457 (2025)","DOI":"10.1007\/978-3-031-73232-4_25"},{"key":"1927_CR39","doi-asserted-by":"crossref","unstructured":"Pandey, K., Guerrero, P., Gadelha, M., Hold-Geoffroy, Y., Singh, K., Mitra, N.J.: Diffusion handles: Enabling 3d edits for diffusion models by lifting activations to 3d. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 7695-7704 (2024)","DOI":"10.1109\/CVPR52733.2024.00735"},{"key":"1927_CR40","doi-asserted-by":"crossref","unstructured":"Sargent, K., Li, Z., Shah, T., Herrmann, C., Yu, H.-X., Zhang, Y., Chan, E.R., Lagun, D., Fei-Fei, L., Sun, D.: Zeronvs: Zero-shot 360-degree view synthesis from a single image. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 9420-9429 (2024)","DOI":"10.1109\/CVPR52733.2024.00900"},{"key":"1927_CR41","doi-asserted-by":"crossref","unstructured":"Yenphraphai, J., Pan, X., Liu, S., Panozzo, D., Xie, S.: Image sculpting: Precise object editing with 3d geometry control. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 4241-4251 (2024)","DOI":"10.1109\/CVPR52733.2024.00406"},{"key":"1927_CR42","doi-asserted-by":"crossref","unstructured":"Bhat, S.F., Mitra, N., Wonka, P.: Loosecontrol: Lifting controlnet for generalized depth conditioning. In: Proc. Conf. Comput. Graph. Interact. Techn., 1-11 (2024)","DOI":"10.1145\/3641519.3657525"},{"key":"1927_CR43","doi-asserted-by":"crossref","unstructured":"Barron, J.T., Mildenhall, B., Verbin, D., Srinivasan, P.P., Hedman, P.: Mip-nerf 360: Unbounded anti-aliased neural radiance fields. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 5470-5479 (2022)","DOI":"10.1109\/CVPR52688.2022.00539"},{"key":"1927_CR44","doi-asserted-by":"crossref","unstructured":"Barron, J.T., Mildenhall, B., Verbin, D., Srinivasan, P.P., Hedman, P.: Zip-nerf: Anti-aliased grid-based neural radiance fields. In: Proc. IEEE\/CVF Int. Conf. Comput. Vis., 19697-19705 (2023)","DOI":"10.1109\/ICCV51070.2023.01804"},{"issue":"4","key":"1927_CR45","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530127","volume":"41","author":"T M\u00fcller","year":"2022","unstructured":"M\u00fcller, T., Evans, A., Schied, C., Keller, A.: Instant neural graphics primitives with a multiresolution hash encoding. Acm Trans. Graph. 41(4), 1\u201315 (2022)","journal-title":"Acm Trans. Graph."},{"key":"1927_CR46","doi-asserted-by":"crossref","unstructured":"Wang, P., Liu, Y., Chen, Z., Liu, L., Liu, Z., Komura, T., Theobalt, C., Wang, W.: F2-nerf: Fast neural radiance field training with free camera trajectories. In: Proc. IEEE\/CVF Conf. Comput. 
Vis. Pattern Recognit., 4150-4159 (2023)","DOI":"10.1109\/CVPR52729.2023.00404"},{"key":"1927_CR47","doi-asserted-by":"crossref","unstructured":"Rematas, K., Liu, A., Srinivasan, P.P., Barron, J.T., Tagliasacchi, A., Funkhouser, T., Ferrari, V.: Urban radiance fields. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 12932-12942 (2022)","DOI":"10.1109\/CVPR52688.2022.01259"},{"key":"1927_CR48","unstructured":"Xie, Z., Zhang, J., Li, W., Zhang, F., Zhang, L.: S-nerf: Neural radiance fields for street views. arXiv preprints, arXiv2303.00749 (2023)"},{"key":"1927_CR49","doi-asserted-by":"crossref","unstructured":"Yang, Z., Chen, Y., Wang, J., Manivasagam, S., Ma, W.-C., Yang, A.J., Urtasun, R.: Unisim: A neural closed-loop sensor simulator. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 1389-1399 (2023)","DOI":"10.1109\/CVPR52729.2023.00140"},{"key":"1927_CR50","unstructured":"Yang, J., Ivanovic, B., Litany, O., Weng, X., Kim, S.W., Li, B., Che, T., Xu, D., Fidler, S., Pavone, M.: EmerNeRF: Emergent Spatial-Temporal Scene Decomposition via Self-Supervision. presented at Int. Conf. Learn. Represent. Vienna, Austria. (2024)"},{"key":"1927_CR51","doi-asserted-by":"crossref","unstructured":"Zhou, X., Lin, Z., Shan, X., Wang, Y., Sun, D., Yang, M.-H.: Drivinggaussian: Composite gaussian splatting for surrounding dynamic autonomous driving scenes. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 21634-21643 (2024)","DOI":"10.1109\/CVPR52733.2024.02044"},{"key":"1927_CR52","doi-asserted-by":"crossref","unstructured":"Yan, Y., Lin, H., Zhou, C., Wang, W., Sun, H., Zhan, K., Lang, X., Zhou, X., Peng, S.: Street gaussians for modeling dynamic urban scenes. arXiv preprints, arXiv2401.01339 (2024)","DOI":"10.1007\/978-3-031-73464-9_10"},{"key":"1927_CR53","unstructured":"Fischer, T., Kulhanek, J., Bul\u00f2, S.R., Porzi, L., Pollefeys, M., Kontschieder, P.: Dynamic 3d gaussian fields for urban areas. arXiv preprints, arXiv2406.03175 (2024)"},{"key":"1927_CR54","unstructured":"Huang, N., Wei, X., Zheng, W., An, P., Lu, M., Zhan, W., Tomizuka, M., Keutzer, K., Zhang, S.: S3gaussian: Self-supervised street gaussians for autonomous driving. arXiv preprints, arXiv2405.20323 (2024)"},{"key":"1927_CR55","doi-asserted-by":"crossref","unstructured":"Shen, X., Li, X., Elhoseiny, M.: Mostgan-v: Video generation with temporal motion styles. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 5652-5661 (2023)","DOI":"10.1109\/CVPR52729.2023.00547"},{"key":"1927_CR56","unstructured":"Tian, Y., Ren, J., Chai, M., Olszewski, K., Peng, X., Metaxas, D.N., Tulyakov, S.: A good image generator is what you need for high-resolution video synthesis. presented at Int. Conf. Learn. Represent. (2021)"},{"key":"1927_CR57","doi-asserted-by":"crossref","unstructured":"Ge, S., Hayes, T., Yang, H., Yin, X., Pang, G., Jacobs, D., Huang, J.-B., Parikh, D.: Long video generation with time-agnostic vqgan and time-sensitive transformer. In: Proc. Eur. Conf. Comput. Vis., 102-118 (2022)","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"1927_CR58","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J.: Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprints, arXiv2205.15868 (2022)"},{"key":"1927_CR59","first-page":"14042","volume":"34","author":"G Le Moing","year":"2021","unstructured":"Le Moing, G., Ponce, J., Schmid, C.: Ccvs: Context-aware controllable video synthesis. Proc. Adv. Neural Inf. Process. Syst. 34, 14042\u201314055 (2021)","journal-title":"Proc. 
Adv. Neural Inf. Process. Syst."},{"key":"1927_CR60","unstructured":"Ho, J., Chan, W., Saharia, C., Whang, J., Gao, R., Gritsenko, A., Kingma, D.P., Poole, B., Norouzi, M., Fleet, D.J.: Imagen video: High definition video generation with diffusion models. arXiv preprints, arXiv2210.02303 (2022)"},{"key":"1927_CR61","unstructured":"Singer, U., Polyak, A., Hayes, T., Yin, X., An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O.: Make-A-Video: Text-to-Video Generation without Text-Video Data. presented at Int. Conf. Learn. Represent. Kigali, Rwanda. (2023)"},{"key":"1927_CR62","first-page":"8633","volume":"35","author":"J Ho","year":"2022","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. Proc. Adv. Neural Inf. Process. Syst. 35, 8633\u20138646 (2022)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR63","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: Magicvideo: Efficient video generation with latent diffusion models. arXiv preprints, arXiv2211.11018 (2022)"},{"key":"1927_CR64","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Rombach, R., Ling, H., Dockhorn, T., Kim, S.W., Fidler, S., Kreis, K.: Align your latents: High-resolution video synthesis with latent diffusion models. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 22563-22575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"1927_CR65","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S.: Modelscope text-to-video technical report. arXiv preprints, arXiv2308.06571 (2023)"},{"key":"1927_CR66","first-page":"7594","volume":"36","author":"X Wang","year":"2024","unstructured":"Wang, X., Yuan, H., Zhang, S., Chen, D., Wang, J., Zhang, Y., Shen, Y., Zhao, D., Zhou, J.: Videocomposer: Compositional video synthesis with motion controllability. Proc. Adv. Neural Inf. Process. Syst. 36, 7594\u2013611 (2024)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR67","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: ControlVideo: Training-free Controllable Text-to-video Generation. presented at Int. Conf. Learn. Represent. Vienna, Austria. (2024)"},{"key":"1927_CR68","unstructured":"Chen, W., Ji, Y., Wu, J., Wu, H., Xie, P., Li, J., Xia, X., Xiao, X., Lin, L.: Control-a-video: Controllable text-to-video generation with diffusion models. arXiv preprints, arXiv2305.13840 (2023)"},{"key":"1927_CR69","unstructured":"Zhang, D.J., Li, D., Le, H., Shou, M.Z., Xiong, C., Sahoo, D.: Moonshot: Towards controllable video generation and editing with multimodal conditions. arXiv preprints, arXiv2401.01827 (2024)"},{"issue":"1","key":"1927_CR70","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1115\/1.3662552","volume":"82","author":"RE Kalman","year":"1960","unstructured":"Kalman, R.E.: A new approach to linear filtering and prediction problems. J. Basic Eng. 82(1), 35\u201345 (1960)","journal-title":"J. Basic Eng."},{"key":"1927_CR71","doi-asserted-by":"crossref","unstructured":"Oshima, Y., Taniguchi, S., Suzuki, M., Matsuo, Y.: Ssm meets video diffusion models: Efficient video generation with structured state spaces. arXiv preprints, arXiv2403.07711 (2024)","DOI":"10.2139\/ssrn.4999610"},{"key":"1927_CR72","doi-asserted-by":"crossref","unstructured":"Park, J., Kim, H.-S., Ko, K., Kim, M., Kim, C.: Videomamba: Spatio-temporal selective state space model. In: Proc. Eur. Conf. Comput. 
Vis., 1-18 (2025)","DOI":"10.1007\/978-3-031-72698-9_1"},{"key":"1927_CR73","doi-asserted-by":"crossref","unstructured":"Li, S., Singh, H., Grover, A.: Mamba-nd: Selective state space modeling for multi-dimensional data. In: Proc. Eur. Conf. Comput. Vis., 75-92 (2025)","DOI":"10.1007\/978-3-031-73414-4_5"},{"key":"1927_CR74","unstructured":"Gao, Y., Huang, J., Sun, X., Jie, Z., Zhong, Y., Ma, L.: Matten: Video generation with mamba-attention. arXiv preprints, arXiv2405.03025 (2024)"},{"key":"1927_CR75","doi-asserted-by":"crossref","unstructured":"Li, K., Li, X., Wang, Y., He, Y., Wang, Y., Wang, L., Qiao, Y.: Videomamba: State space model for efficient video understanding. In: Proc. Eur. Conf. Comput. Vis., 237-255 (2025)","DOI":"10.1007\/978-3-031-73347-5_14"},{"key":"1927_CR76","doi-asserted-by":"crossref","unstructured":"Hu, V.T., Baumann, S.A., Gui, M., Grebenkova, O., Ma, P., Fischer, J., Ommer, B.: Zigma: A dit-style zigzag mamba diffusion model. In: Proc. Eur. Conf. Comput. Vis., 148\u2013166 (2025)","DOI":"10.1007\/978-3-031-72664-4_9"},{"key":"1927_CR77","doi-asserted-by":"crossref","unstructured":"Li, K., Li, X., Wang, Y., He, Y., Wang, Y., Wang, L., Qiao, Y.: Videomamba: State space model for efficient video understanding. In: Proc. Eur. Conf. Comput. Vis., 237\u2013255 (2024)","DOI":"10.1007\/978-3-031-73347-5_14"},{"key":"1927_CR78","doi-asserted-by":"crossref","unstructured":"Han, H., Li, Y., Wu, M.: Os-mamba: Overall scanning mamba for building damage assessment using multi-phase satellite imagery. IEICE Transactions on Information and Systems advpub, 2025EDL8004 (2025)","DOI":"10.1587\/transinf.2025EDL8004"},{"key":"1927_CR79","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Proc. Adv. Neural Inf. Process. Syst. 25, 1097\u20131105 (2012)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR80","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprints, arXiv1312.6114 (2013)"},{"key":"1927_CR81","doi-asserted-by":"crossref","unstructured":"Xu, Y., Chai, M., Shi, Z., Peng, S., Skorokhodov, I., Siarohin, A., Yang, C., Shen, Y., Lee, H.-Y., Zhou, B.: Discoscene: Spatially disentangled generative radiance fields for controllable 3d-aware scene synthesis. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 4402-4412 (2023)","DOI":"10.1109\/CVPR52729.2023.00428"},{"key":"1927_CR82","first-page":"76289","volume":"37","author":"Z Wu","year":"2024","unstructured":"Wu, Z., Rubanova, Y., Kabra, R., Hudson, D.A., Gilitschenski, I., Aytar, Y., Steenkiste, S., Allen, K.R., Kipf, T.: Neural assets: 3d-aware multi-object scene synthesis with image diffusion models. Proc. Adv. Neural Inf. Process. Syst. 37, 76289\u201376318 (2024)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR83","unstructured":"Michel, O., Bhattad, A., VanderBilt, E., Krishna, R., Kembhavi, A., Gupta, T.: Object 3dit: Language-guided 3d-aware Image Editing"},{"key":"1927_CR84","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: Proc. IEEE\/CVF Int. Conf. Comput. 
Vis., 2961-2969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"1927_CR85","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 22500-22510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"1927_CR86","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., 586-595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"1927_CR87","unstructured":"Unterthiner, T., Van\u00a0Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: A new metric and challenges. arXiv preprints, arXiv1812.01717 (2018)"},{"key":"1927_CR88","first-page":"6626","volume":"30","author":"M Heusel","year":"2017","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. Proc. Adv. Neural Inf. Process. Syst. 30, 6626\u20136637 (2017)","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"1927_CR89","unstructured":"Loshchilov, I., Hutter, F.: Decoupled Weight Decay Regularization. presented at Int. Conf. Learn. Represent. New Orleans, USA. (2019)"},{"key":"1927_CR90","doi-asserted-by":"crossref","unstructured":"Chen, X., Liu, Z., Chen, M., Feng, Y., Liu, Y., Shen, Y., Zhao, H.: Livephoto: Real image animation with text-guided motion control. In: Proc. Eur. Conf. Comput. Vis., 475-491 (2025)","DOI":"10.1007\/978-3-031-72649-1_27"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01927-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01927-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01927-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:05:07Z","timestamp":1757927107000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01927-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":90,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1927"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01927-x","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8]]},"assertion":[{"value":"7 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"316"}}