{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,18]],"date-time":"2025-05-18T04:05:59Z","timestamp":1747541159385,"version":"3.40.5"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,5,9]],"date-time":"2025-05-09T00:00:00Z","timestamp":1746748800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,5,9]],"date-time":"2025-05-09T00:00:00Z","timestamp":1746748800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/100014718","name":"Innovative Research Group Project of the National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["KJZD-M202301901"],"award-info":[{"award-number":["KJZD-M202301901"]}],"id":[{"id":"10.13039\/100014718","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Complex Intell. Syst."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s40747-025-01872-2","type":"journal-article","created":{"date-parts":[[2025,5,9]],"date-time":"2025-05-09T05:21:16Z","timestamp":1746768076000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["STAR-SNR: spatial\u2013temporal adaptive regulation and SNR optimization for few-shot video generation"],"prefix":"10.1007","volume":"11","author":[{"given":"Xian","family":"Yu","sequence":"first","affiliation":[]},{"given":"Jianxun","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Siran","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Hongyu","family":"Yi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,9]]},"reference":[{"key":"1872_CR1","unstructured":"Babaeizadeh M, Saffar MT, Nair S, Levine S, Finn C, Erhan D (2021) Fitvid: overfitting in pixel-level video prediction. arXiv preprint arXiv:2106.13195"},{"key":"1872_CR2","unstructured":"Bahng H, Jahanian A, Sankaranarayanan S, Isola P (2022) Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274,"},{"key":"1872_CR3","doi-asserted-by":"crossref","unstructured":"Blattmann A, Rombach R, Ling H, Dockhorn T, Kim SW, Fidler S, Kreis K (2023) Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 22563\u201322575","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"1872_CR4","first-page":"16664","volume":"35","author":"S Chen","year":"2022","unstructured":"Chen S, Ge C, Tong Z, Wang J, Song Y, Wang J, Luo P (2022) Adaptformer: adapting vision transformers for scalable visual recognition. Adv Neural Inf Process Syst 35:16664\u201316678","journal-title":"Adv Neural Inf Process Syst"},{"key":"1872_CR5","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal P, Nichol A (2021) Diffusion models beat gans on image synthesis. Adv Neural Inf Process Syst 34:8780\u20138794","journal-title":"Adv Neural Inf Process Syst"},{"key":"1872_CR6","unstructured":"Dosovitskiy A (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"1872_CR7","doi-asserted-by":"crossref","unstructured":"Esser P, Rombach R, Ommer B (2021) Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 12873\u201312883","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"1872_CR8","doi-asserted-by":"crossref","unstructured":"Esser P, Chiu J, Atighehchian P, Granskog J, Germanidis A (2023) Structure and content-guided video synthesis with diffusion models. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7346\u20137356","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"1872_CR9","unstructured":"Geyer M, Bar-Tal O, Bagon S, Dekel T (2023) Tokenflow: consistent diffusion features for consistent video editing. arXiv preprint arXiv:2307.10373,"},{"issue":"11","key":"1872_CR10","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow I, Pouget-Abadie J, Mirza M, Xu B, Warde-Farley D, Ozair S, Courville A, Bengio Y (2020) Generative adversarial networks. Commun ACM 63(11):139\u2013144","journal-title":"Commun ACM"},{"key":"1872_CR11","first-page":"23968","volume":"34","author":"M Grci\u0107","year":"2021","unstructured":"Grci\u0107 M, Grubi\u0161i\u0107 I, \u0160egvi\u0107 S (2021) Densely connected normalizing flows. Adv Neural Inf Process Syst 34:23968\u201323982","journal-title":"Adv Neural Inf Process Syst"},{"key":"1872_CR12","unstructured":"Guo Y, Yang C, Rao A, Liang Z, Wang Y, Qiao Y, Agrawala M, Lin D, Dai B (2023) Animatediff: animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725"},{"key":"1872_CR13","doi-asserted-by":"crossref","unstructured":"Hang T, Gu S, Li C, Bao J, Chen D, Hu H, Geng X, Guo B (2023) Efficient diffusion training via min-snr weighting strategy. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7441\u20137451","DOI":"10.1109\/ICCV51070.2023.00684"},{"key":"1872_CR14","unstructured":"He J, Zhou C, Ma X, Berg-Kirkpatrick T, Neubig G (2021) Towards a unified view of parameter-efficient transfer learning. arXiv preprint arXiv:2110.04366"},{"key":"1872_CR15","unstructured":"He Y, Qian J, Wang J, Le CX, Hetang C, Lyu Q, Wang W, Yue T (2019) Depth-wise decomposition for accelerating separable convolutions in efficient convolutional neural networks. arXiv preprint arXiv:1910.09455"},{"key":"1872_CR16","unstructured":"Ho J, Salimans T (2022) Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598"},{"key":"1872_CR17","unstructured":"Ho J, Chan W, Saharia C, Whang J, Gao R, Gritsenko A, Kingma DP, Poole B, Norouzi M, Fleet David\u00a0J et\u00a0al (2022) Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303"},{"issue":"47","key":"1872_CR18","first-page":"1","volume":"23","author":"J Ho","year":"2022","unstructured":"Ho J, Saharia C, Chan W, Fleet DJ, Norouzi M, Salimans T (2022) Cascaded diffusion models for high fidelity image generation. J Mach Learn Res 23(47):1\u201333","journal-title":"J Mach Learn Res"},{"key":"1872_CR19","first-page":"8633","volume":"35","author":"J Ho","year":"2022","unstructured":"Ho J, Salimans T, Gritsenko A, Chan W, Norouzi M, Fleet DJ (2022) Video diffusion models. Adv Neural Inf Process Syst 35:8633\u20138646","journal-title":"Adv Neural Inf Process Syst"},{"key":"1872_CR20","unstructured":"Hong S, Seo J, Shin H, Hong S, Kim S (2023) Direct2v: large language models are frame-level directors for zero-shot text-to-video generation. arXiv preprint arXiv:2305.14330"},{"key":"1872_CR21","unstructured":"Houlsby N, Giurgiu A, Jastrzebski S, Morrone B, De\u00a0Laroussilhe Q, Gesmundo A, Attariyan M, Gelly S (2019) Parameter-efficient transfer learning for nlp. In: International conference on machine learning. PMLR, pp 2790\u20132799"},{"key":"1872_CR22","unstructured":"Hu EJ, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W (2021) Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685"},{"key":"1872_CR23","doi-asserted-by":"crossref","unstructured":"Huang H, Feng Y, Shi C, Xu L, Yu J, Yang S (2024) Free-bloom: zero-shot text-to-video generator with llm director and ldm animator. Adv Neural Inf Process Syst 36","DOI":"10.1162\/neco_a_01639"},{"key":"1872_CR24","doi-asserted-by":"publisher","first-page":"1005505","DOI":"10.3389\/feart.2022.1005505","volume":"10","author":"Z Jin","year":"2023","unstructured":"Jin Z, Li X, Yang H, Bangyu W, Zhu Xu (2023) Depthwise separable convolution unet for 3d seismic data interpolation. Front Earth Sci 10:1005505","journal-title":"Front Earth Sci"},{"key":"1872_CR25","unstructured":"Kingma DP (2013) Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114"},{"key":"1872_CR26","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) Tsm: temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"1872_CR27","unstructured":"Molad E, Horwitz E, Valevski D, Acha AR, Matias Y, Pritch Y, Leviathan Y, Hoshen Y (2023) Dreamix: video diffusion models are general video editors. arXiv preprint arXiv:2302.01329"},{"key":"1872_CR28","unstructured":"Nichol AQ, Dhariwal P (2021) Improved denoising diffusion probabilistic models. In: International conference on machine learning. PMLR, pp 8162\u20138171"},{"key":"1872_CR29","first-page":"26462","volume":"35","author":"J Pan","year":"2022","unstructured":"Pan J, Lin Z, Zhu X, Shao J, Li H (2022) St-adapter: parameter-efficient image-to-video transfer learning. Adv Neural Inf Process Syst 35:26462\u201326477","journal-title":"Adv Neural Inf Process Syst"},{"key":"1872_CR30","unstructured":"Podell D, English Z, Lacey K, Blattmann A, Dockhorn T, M\u00fcller J, Penna J, Rombach R (2023) Sdxl: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952"},{"key":"1872_CR31","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR, pp 8748\u20138763"},{"key":"1872_CR32","unstructured":"Ramesh A, Pavlov M, Goh G, Gray S, Voss C, Radford A, Chen M, Sutskever I (2021) Zero-shot text-to-image generation. In: International conference on machine learning. PMLR, pp 8821\u20138831"},{"key":"1872_CR33","unstructured":"Ramesh A, Dhariwal P, Nichol A, Chu C, Chen M (2022) Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125"},{"key":"1872_CR34","doi-asserted-by":"crossref","unstructured":"Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B (2022) High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1872_CR35","doi-asserted-by":"crossref","unstructured":"Ruiz N, Li Y, Jampani V, Pritch Y, Rubinstein M, Aberman K (2023) Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 22500\u201322510","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"1872_CR36","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia C, Chan W, Saxena S, Li L, Whang J, Denton EL, Ghasemipour K, Gontijo LR, Karagol AB, Salimans T et al (2022) Photorealistic text-to-image diffusion models with deep language understanding. Adv Neural Inf Process Syst 35:36479\u201336494","journal-title":"Adv Neural Inf Process Syst"},{"key":"1872_CR37","doi-asserted-by":"crossref","unstructured":"Sauer A, Schwarz K, Geiger A (2022) Stylegan-xl: scaling stylegan to large diverse datasets. In: ACM SIGGRAPH 2022 conference proceedings, pp 1\u201310","DOI":"10.1145\/3528233.3530738"},{"key":"1872_CR38","unstructured":"Sohn K, Lee H, Yan X (2015) Learning structured output representation using deep conditional generative models. Adv Neural Inf Process Syst 28"},{"key":"1872_CR39","unstructured":"Soni A, Venkataraman S, Chandra A, Fischmeister S, Liang P, Dai B, Yang S (2024) Videoagent: self-improving video generation. arXiv preprint arXiv:2410.10076"},{"key":"1872_CR40","doi-asserted-by":"crossref","unstructured":"Sun C, Myers A, Vondrick C, Murphy K, Schmid C (2019) Videobert: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7464\u20137473","DOI":"10.1109\/ICCV.2019.00756"},{"key":"1872_CR41","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"1872_CR42","doi-asserted-by":"crossref","unstructured":"Tumanyan N, Geyer M, Bagon S, Dekel T (2023) Plug-and-play diffusion features for text-driven image-to-image translation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1921\u20131930","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"1872_CR43","unstructured":"Van Den\u00a0Oord A, Vinyals O et\u00a0al (2017) Neural discrete representation learning. Adv Neural Inf Process Syst 30"},{"key":"1872_CR44","unstructured":"Wang X, Yuan H, Zhang S, Chen D, Wang J, Zhang Y, Shen Y, Zhao D, Zhou J (2024) Videocomposer: compositional video synthesis with motion controllability. Adv Neural Inf Process Syst 36"},{"key":"1872_CR45","unstructured":"Wang Z, Oates T (2015) Encoding time series as images for visual inspection and classification using tiled convolutional neural networks. In: Workshops at the twenty-ninth AAAI conference on artificial intelligence"},{"key":"1872_CR46","doi-asserted-by":"crossref","unstructured":"Wu JZ, Ge Y, Wang X, Lei SW, Gu Y, Shi Y, Hsu W, Shan Y, Qie X, Shou MZ (2023) Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7623\u20137633","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"1872_CR47","doi-asserted-by":"crossref","unstructured":"Wu R, Chen L, Yang T, Guo C, Li C, Zhang X (2023) Lamp: learn a motion pattern for few-shot-based video generation. arXiv preprint arXiv:2310.10769","DOI":"10.1109\/CVPR52733.2024.00677"},{"key":"1872_CR48","doi-asserted-by":"crossref","unstructured":"Xing Z, Dai Q, Hu H, Wu Z, Jiang Y-G (2024) Simda: simple diffusion adapter for efficient video generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7827\u20137839","DOI":"10.1109\/CVPR52733.2024.00748"},{"key":"1872_CR49","unstructured":"Yang M, Du Y, Dai B, Schuurmans D, Tenenbaum JB, Abbeel P (2023) Probabilistic adaptation of text-to-video models. arXiv preprint arXiv:2306.01872"},{"key":"1872_CR50","doi-asserted-by":"crossref","unstructured":"Ye R, Liu F, Zhang L (2018) 3d depthwise convolution: reducing model parameters in 3d vision tasks. arxiv arXiv preprint arXiv:1808.01556","DOI":"10.1007\/978-3-030-18305-9_15"},{"key":"1872_CR51","unstructured":"Yu J, Xu Y, Koh JY, Luong T, Baid G, Wang Z, Vasudevan V, Ku A, Yang Y, Ayan BK et\u00a0al (2022) Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789"},{"key":"1872_CR52","unstructured":"Zhang B, Jin X, Gong W, Xu K, Zhang Z, Wang P, Shen X, Feng J (2023) Multimodal video adapter for parameter efficient video text retrieval. arXiv preprint arXiv:2301.07868"},{"key":"1872_CR53","unstructured":"Zhou D, Wang W, Yan H, Lv W, Zhu Y, Feng J (2022) Magicvideo: efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018"}],"container-title":["Complex &amp; Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-01872-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s40747-025-01872-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s40747-025-01872-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,17]],"date-time":"2025-05-17T11:24:00Z","timestamp":1747481040000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s40747-025-01872-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,9]]},"references-count":53,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1872"],"URL":"https:\/\/doi.org\/10.1007\/s40747-025-01872-2","relation":{},"ISSN":["2199-4536","2198-6053"],"issn-type":[{"type":"print","value":"2199-4536"},{"type":"electronic","value":"2198-6053"}],"subject":[],"published":{"date-parts":[[2025,5,9]]},"assertion":[{"value":"17 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"283"}}