{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T18:32:54Z","timestamp":1762108374614,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730061"},{"type":"electronic","value":"9783031730078"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73007-8_11","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T19:02:40Z","timestamp":1727722960000},"page":"176-192","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["SwiftBrush V2: Make Your One-Step Diffusion Model Better Than Its Teacher"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-5217-4558","authenticated-orcid":false,"given":"Trung","family":"Dao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8012-5463","authenticated-orcid":false,"given":"Thuan Hoang","family":"Nguyen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7015-4212","authenticated-orcid":false,"given":"Thanh","family":"Le","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9639-0529","authenticated-orcid":false,"given":"Duc","family":"Vu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9259-420X","authenticated-orcid":false,"given":"Khoi","family":"Nguyen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0973-0889","authenticated-orcid":false,"given":"Cuong","family":"Pham","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3120-4036","authenticated-orcid":false,"given":"Anh","family":"Tran","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"11_CR1","unstructured":"Balaji, Y., et\u00a0al.: eDiff-I: text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)"},{"key":"11_CR2","unstructured":"Betker, J., et\u00a0al.: Improving image generation with better captions. Comput. Sci. 2(3), 8 (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf"},{"key":"11_CR3","unstructured":"Bohan, O.B.: Madebyollin\/taesd, March 2024. https:\/\/github.com\/madebyollin\/taesd"},{"key":"11_CR4","unstructured":"Chang, H., et\u00a0al.: Muse: text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chang, H., Zhang, H., Jiang, L., Liu, C., Freeman, W.T.: Maskgit: masked generative image transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
11315\u201311325 (2022)","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"11_CR6","unstructured":"Check, F.: Images appearing to show Donald Trump arrest created by AI. Reuters, March 2023. https:\/\/www.reuters.com\/article\/idUSL1N35T2TU"},{"key":"11_CR7","unstructured":"Chen, J., et al.: PIXART-$$\\delta $$: fast and controllable image generation with latent consistency models (2024)"},{"key":"11_CR8","unstructured":"Choshen, L., Venezian, E., Slonim, N., Katz, Y.: Fusing finetuned models for better pretraining (2022)"},{"key":"11_CR9","first-page":"19822","volume":"34","author":"M Ding","year":"2021","unstructured":"Ding, M., et al.: Cogview: mastering text-to-image generation via transformers. Adv. Neural. Inf. Process. Syst. 34, 19822\u201319835 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR10","unstructured":"Gu, J., Zhai, S., Zhang, Y., Liu, L., Susskind, J.: BOOT: data-free distillation of denoising diffusion models with bootstrapping. arXiv preprint arXiv:2306.05544 (2023)"},{"key":"11_CR11","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"11_CR12","unstructured":"Hu, E.J., et\u00a0al.: Lora: low-rank adaptation of large language models. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"11_CR13","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip, July 2021. https:\/\/doi.org\/10.5281\/zenodo.5143773, if you use this software, please cite it as below","DOI":"10.5281\/zenodo.5143773"},{"key":"11_CR14","unstructured":"Izmailov, P., Podoprikhin, D., Garipov, T., Vetrov, D., Wilson, A.G.: Averaging weights leads to wider optima and better generalization. arXiv preprint arXiv:1803.05407 (2018)"},{"key":"11_CR15","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"11_CR16","unstructured":"Jin, X., Ren, X., Preotiuc-Pietro, D., Cheng, P.: Dataless knowledge fusion by merging weights of language models. arXiv preprint arXiv:2212.09849 (2022)"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10124\u201310134 (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"11_CR19","unstructured":"Kynk\u00e4\u00e4nniemi, T., Karras, T., Laine, S., Lehtinen, J., Aila, T.: Improved precision and recall metric for assessing generative models. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"11_CR20","unstructured":"Li, W., Peng, Y., Zhang, M., Ding, L., Hu, H., Shen, L.: Deep model fusion: a survey. 
arXiv preprint arXiv:2309.15698 (2023)"},{"key":"11_CR21","unstructured":"Lin, S., Wang, A., Yang, X.: SDXL-lightning: progressive adversarial diffusion distillation (2024)"},{"key":"11_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"11_CR23","unstructured":"Liu, X., Gong, C., Liu, Q.: Flow straight and fast: learning to generate and transfer data with rectified flow. In: International Conference on Learning Representations (2023)"},{"key":"11_CR24","unstructured":"Liu, X., Zhang, X., Ma, J., Peng, J., Liu, Q.: InstaFlow: one step is enough for high-quality diffusion-based text-to-image generation. arXiv preprint arXiv:2309.06380 (2023)"},{"key":"11_CR25","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"11_CR26","unstructured":"Luo, S., Tan, Y., Huang, L., Li, J., Zhao, H.: Latent consistency models: synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 (2023)"},{"key":"11_CR27","unstructured":"Midjourney: Midjourney. https:\/\/www.midjourney.com"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Nguyen, T.H., Tran, A.: Swiftbrush: one-step text-to-image diffusion model with variational score distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.00746"},{"key":"11_CR29","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. In: International Conference on Machine Learning (2021)"},{"key":"11_CR30","unstructured":"Pan, J., et al.: JourneyDB: a benchmark for generative image understanding. Adv. Neural Inform. Process. Syst. (2023)"},{"key":"11_CR31","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: Dreamfusion: text-to-3d using 2d diffusion. In: International Conference on Learning Representations (2022)"},{"key":"11_CR32","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Learning Representations, pp. 8748\u20138763. PMLR (2021)"},{"key":"11_CR33","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022). 1(2), 3"},{"key":"11_CR34","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10674\u201310685 (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"11_CR35","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"11_CR36","unstructured":"Roose, K.: AI-Generated Art Won a Prize. Artists Aren\u2019t Happy. N.Y. Times, September 2022. https:\/\/www.nytimes.com\/2022\/09\/02\/technology\/ai-artificial-intelligence-artists.html"},{"key":"11_CR37","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR38","unstructured":"Sauer, A., Karras, T., Laine, S., Geiger, A., Aila, T.: Stylegan-t: unlocking the power of gans for fast large-scale text-to-image synthesis. arXiv preprint arXiv:2301.09515 (2023)"},{"key":"11_CR39","doi-asserted-by":"crossref","unstructured":"Sauer, A., Lorenz, D., Blattmann, A., Rombach, R.: Adversarial diffusion distillation. arXiv preprint arXiv:2311.17042 (2023)","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"11_CR40","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural Inform. Process. Syst. (2022)"},{"key":"11_CR41","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The caltech-ucsd birds-200-2011 dataset (2011)"},{"key":"11_CR42","unstructured":"Wang, Z., et al.: ProlificDreamer: high-fidelity and diverse text-to-3D generation with variational score distillation. Adv. Neural Inform. Process. Syst. (2023)"},{"key":"11_CR43","unstructured":"Wortsman, M., et\u00a0al.: Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time. In: International Conference on Machine Learning, pp. 23965\u201323998. PMLR (2022)"},{"key":"11_CR44","unstructured":"Wu, X., et al.: Human Preference Score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"11_CR45","doi-asserted-by":"crossref","unstructured":"Xu, Y., Zhao, Y., Xiao, Z., Hou, T.: Ufogen: you forward once large scale text-to-image generation via diffusion GANs (2023)","DOI":"10.1109\/CVPR52733.2024.00783"},{"key":"11_CR46","unstructured":"Yadav, P., Tam, D., Choshen, L., Raffel, C., Bansal, M.: Resolving interference when merging models. arXiv preprint arXiv:2306.01708 (2023)"},{"key":"11_CR47","doi-asserted-by":"crossref","unstructured":"Yin, T., et al.: One-step diffusion with distribution matching distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.00632"},{"key":"11_CR48","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)"},{"key":"11_CR49","unstructured":"Zhang, Y., Hooi, B.: Hipa: enabling one-step text-to-image diffusion models via high-frequency-promoting adaptation. 
arXiv preprint arXiv:2311.18158 (2023)"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xu, Y., Xiao, Z., Hou, T.: Mobilediffusion: subsecond text-to-image generation on mobile devices (2023)","DOI":"10.1007\/978-3-031-73033-7_13"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73007-8_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T09:08:50Z","timestamp":1730797730000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73007-8_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031730061","9783031730078"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73007-8_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
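Note: the JSON above is a Crossref REST API "work" record for the ECCV 2024 chapter "SwiftBrush V2: Make Your One-Step Diffusion Model Better Than Its Teacher". As a minimal sketch of how such a record can be re-fetched and parsed (assuming network access; uses only the Python standard library, and the https://api.crossref.org/works/{DOI} route, which is Crossref's standard works endpoint):

import json
import urllib.request

# DOI taken verbatim from the record above; Crossref accepts the raw
# slash-containing DOI directly in the /works/ path.
DOI = "10.1007/978-3-031-73007-8_11"
url = "https://api.crossref.org/works/" + DOI

# The response envelope matches the record shown above:
# {"status": "ok", "message-type": "work", ..., "message": {...}}
with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

work = record["message"]
print(work["title"][0])  # chapter title
print(", ".join(a["given"] + " " + a["family"] for a in work["author"]))
print("DOI:", work["DOI"], "| references:", work["references-count"])

Field names such as "message", "title", "author", "DOI", and "references-count" appear verbatim in the record above; the variable names and printed format here are illustrative only.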