{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:57:34Z","timestamp":1777888654532,"version":"3.51.4"},"reference-count":114,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01170","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"1-13","source":"Crossref","is-referenced-by-count":0,"title":["Vivid4D: Improving 4D Reconstruction from Monocular Video by Video Inpainting"],"prefix":"10.1109","author":[{"given":"Jiaxin","family":"Huang","sequence":"first","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sheng","family":"Miao","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bangbang","family":"Yang","sequence":"additional","affiliation":[{"name":"ByteDance"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuewen","family":"Ma","sequence":"additional","affiliation":[{"name":"ByteDance"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiyi","family":"Liao","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","author":"Bai","year":"2024","journal-title":"Syncammaster: Synchronizing multi-camera video generation from diverse viewpoints"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01376"},{"key":"ref3","first-page":"5855","article-title":"Mip-nerf: A multiscale representation for anti-aliasing neural radiance fields","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"Jonathan","year":"2021"},{"key":"ref4","first-page":"5470","article-title":"Mip-nerf 360: Unbounded anti-aliased neural radiance fields","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Jonathan","year":"2022"},{"key":"ref5","first-page":"19697","article-title":"Zip-nerf: Anti-aliased grid-based neural radiance fields","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Jonathan","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02023"},{"key":"ref7","author":"Blattmann","year":"2023","journal-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets"},{"key":"ref8","author":"Bochkovskii","year":"2024","journal-title":"Depth pro: Sharp monocular metric depth in less than a second"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01840"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32213"},{"key":"ref12","author":"Chen","year":"2024","journal-title":"Optimizing 3d gaussian splatting for sparse viewpoint scene reconstruction"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02126"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72664-4_21"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3048"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/tvcg.2025.3611489"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657463"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3179575"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612478"},{"key":"ref20","author":"Fan","year":"2024","journal-title":"Instantsplat: Unbounded sparse-view pose-free gaussian splatting in 40 seconds"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555383"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/358669.358692"},{"key":"ref23","first-page":"12479","article-title":"Kplanes: Explicit radiance fields in space, time, and appear-ance","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Fridovich-Keil","year":"2023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00566"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2447"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2403"},{"key":"ref27","author":"Gu","year":"2024","journal-title":"Advanced video inpainting using optical flow-guided efficient diffusion"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00512"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01246"},{"key":"ref30","author":"Hong","year":"2022","journal-title":"Cogvideo: Large-scale pretraining for text-to-video generation via transformers"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00193"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657428"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2024.104365"},{"key":"ref36","author":"Kong","year":"2024","journal-title":"Hunyuanvideo: A systematic framework for large video generative models"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32477"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1093\/benz\/9780199773787.article.b00126336"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01963"},{"key":"ref40","author":"Li","year":"2025","journal-title":"Diffueraser: A diffusion model for video inpainting"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00643"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00416"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00981"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01997"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2026.3666733"},{"key":"ref46","author":"Liu","year":"2024","journal-title":"Novel view extrapolation with video diffusion priors"},{"key":"ref47","author":"Liu","year":"2024","journal-title":"Modgs: Dynamic gaussian splatting from causually-captured monocular videos"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3590140.3592851"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01952"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/3dv62453.2024.00044"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00194"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73404-5_6"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref55","author":"Nan","year":"2024","journal-title":"Openvid-1m: A large-scale high-quality dataset for text-tovideo generation"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00540"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480487"},{"key":"ref59","article-title":"Sp2360: Sparse-view $360 \\circ$ scene reconstruction using cascaded 2d diffusion priors","volume-title":"ECCV 2024 Workshop on Wild 3D: 3D Modeling, Reconstruction, and Generation in the Wild","author":"Paul","year":"2024"},{"key":"ref60","author":"Poole","year":"2022","journal-title":"Dreamfusion: Text-to-3d using 2d diffusion"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"ref62","author":"Ren","year":"2024","journal-title":"L4gm: Large 4d gaussian reconstruction model"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00574"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.445"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_31"},{"key":"ref67","first-page":"80220","article-title":"Genwarp: Single image to novel views with semantic-preserving generative warping","volume":"37","author":"Seo","year":"2025","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02127"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687681"},{"key":"ref70","author":"Sun","year":"2024","journal-title":"Dimensionx: Create any 3d and 4d scenes from a single image with controllable video diffusion"},{"key":"ref71","first-page":"1","article-title":"Lgm: Large multiview gaussian model for high-resolution 3d content creation","volume-title":"European Conference on Computer Vision","author":"Tang","year":"2024"},{"key":"ref72","author":"Unterthiner","year":"2019","journal-title":"Towards accurate generative models of video: A new metric and challenges"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72691-0_18"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72784-9_9"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00832"},{"key":"ref76","author":"Wang","year":"2023","journal-title":"Modelscope text-to-video technical report"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00404"},{"key":"ref78","author":"Wang","year":"2024","journal-title":"Shape of motion: 4d reconstruction from a single video"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00983"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01956"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32847"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01920"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02427"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02036"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00930"},{"key":"ref87","author":"Xiao","year":"2024","journal-title":"Trajectory attention for fine-grained video motion control"},{"key":"ref88","volume-title":"Sparsegs: Real-time 360\u00b0 sparse view synthesis using gaussian splatting","author":"Xiong","year":"2024"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73464-9_10"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-024-00048-9"},{"key":"ref91","author":"Yang","year":"2023","journal-title":"Track anything: Segment anything meets videos"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00798"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"ref94","first-page":"21875","volume":"37","author":"Yang","year":"2025","journal-title":"Depth anything v2. Advances in Neural Information Processing Systems"},{"key":"ref95","author":"Yang","year":"2025","journal-title":"Mtv-inpaint: Multi-task long video inpainting"},{"key":"ref96","author":"Yang","year":"2023","journal-title":"Real-time photorealistic dynamic scene representation and rendering with 4d gaussian splatting"},{"key":"ref97","author":"Yang","year":"2024","journal-title":"Cogvideox: Text-tovideo diffusion models with an expert transformer"},{"key":"ref98","author":"You","year":"2024","journal-title":"Nvssolver: Video diffusion model as zero-shot novel view synthesizer"},{"key":"ref99","author":"Yu","year":"2024","journal-title":"Lm-gaussian: Boost sparse-view 3d gaussian splatting with large model priors"},{"key":"ref100","author":"Mark","year":"2025","journal-title":"Trajectorycrafter: Redirecting camera trajectory for monocular videos via diffusion models"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3613256"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01839"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00375"},{"key":"ref104","author":"Zhang","year":"2024","journal-title":"Monst3r: A simple approach for estimating geometry in the presence of motion"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73232-4_19"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00684"},{"key":"ref108","author":"Zhao","year":"2024","journal-title":"Stereocrafter: Diffusion-based generation of long and high-fidelity stereoscopic 3d from monocular videos"},{"key":"ref109","author":"Zhao","year":"2024","journal-title":"Genxd: Generating any 3d and 4d scenes"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02016"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00961"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01211"},{"key":"ref113","author":"Zhu","year":"2025","journal-title":"Ar4d: Autoregressive 4d generation from monocular videos"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i10.33203"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444308.pdf?arnumber=11444308","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:51:02Z","timestamp":1777611062000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444308\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":114,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01170","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}