{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:18:16Z","timestamp":1775578696229,"version":"3.50.1"},"publisher-location":"Cham","reference-count":54,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031734106","type":"print"},{"value":"9783031734113","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73411-3_5","type":"book-chapter","created":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T20:08:00Z","timestamp":1732306080000},"page":"74-91","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":85,"title":["PIXART-$$\\Sigma $$: Weak-to-Strong Training of\u00a0Diffusion Transformer for\u00a04K Text-to-Image Generation"],"prefix":"10.1007","author":[{"given":"Junsong","family":"Chen","sequence":"first","affiliation":[]},{"given":"Chongjian","family":"Ge","sequence":"additional","affiliation":[]},{"given":"Enze","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Yue","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Lewei","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Xiaozhe","family":"Ren","sequence":"additional","affiliation":[]},{"given":"Zhongdao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Ping","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Huchuan","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Zhenguo","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,23]]},"reference":[{"key":"5_CR1","unstructured":"Aesthetic predictor (2023). https:\/\/github.com\/christophschuhmann\/improved-aesthetic-predictor"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Bao, F., et al.: All are worth words: a ViT backbone for diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02171"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Chandra, A., T\u00fcnnermann, L., L\u00f6fstedt, T., Gratz, R.: Transformer-based deep learning for predicting protein properties in the life sciences. eLife Sciences Publications, Ltd (2023)","DOI":"10.7554\/eLife.82819"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: PixArt-$$\\alpha $$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. In: ICLR (2024)","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: ShareGPT4V: improving large multi-modal models with better captions. In: arXiv (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"5_CR6","unstructured":"Chen, S., et al.: GenTron: Delving deep into diffusion transformers for image and video generation. arXiv preprint arXiv:2312.04557 (2023)"},{"key":"5_CR7","unstructured":"Chen, T., Cheng, Y., Gan, Z., Yuan, L., Zhang, L., Wang, Z.: Chasing sparsity in vision transformers: an end-to-end exploration. In: NeurIPS (2021)"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., Liu, Z., Tang, H., Yi, L., Zhao, H., Han, S.: SparseViT: revisiting activation sparsity for efficient high-resolution vision transformer. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00205"},{"key":"5_CR9","unstructured":"Choromanski, K., et\u00a0al.: Rethinking attention with performers. In: ICLR (2021)"},{"key":"5_CR10","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. In: arXiv (2022)"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Du, R., Chang, D., Hospedales, T., Song, Y.Z., Ma, Z.: DemoFusion: Democratising high-resolution image generation with no \\$\\$\\$. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00589"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Gao, S., Zhou, P., Cheng, M.M., Yan, S.: Masked diffusion transformer is a strong image synthesizer. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02117"},{"key":"5_CR13","unstructured":"Ge, C., et al.: Advancing vision transformers with group-mix attention. In: arXiv (2023)"},{"key":"5_CR14","unstructured":"von Glehn, I., Spencer, J.S., Pfau, D.: A self-attention ansatz for ab-initio quantum chemistry. In: ICLR (2023)"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Hatamizadeh, A., Song, J., Liu, G., Kautz, J., Vahdat, A.: DiffiT: Diffusion vision transformers for image generation. In: arXiv (2023)","DOI":"10.1007\/978-3-031-73242-3_3"},{"key":"5_CR16","unstructured":"He, Y., et al.: ScaleCrafter: tuning-free higher-resolution visual generation with diffusion models. In: ICLR (2024)"},{"key":"5_CR17","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: NeurIPS (2017)"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"5_CR19","unstructured":"Li, D., Kamko, A., Akhgari, E., Sabet, A., Xu, L., Doshi, S.: Playground v2.5: Three insights towards enhancing aesthetic quality in text-to-image generation. In: arXiv (2024)"},{"key":"5_CR20","unstructured":"Li, D., Kamko, A., Sabet, A., Akhgari, E., Xu, L., Doshi, S.: Playground v2. https:\/\/huggingface.co\/playgroundai\/playground-v2-1024px-aesthetic"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"5_CR22","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: arXiv (2023)"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"5_CR24","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: arXiv (2017)"},{"key":"5_CR25","unstructured":"Lu, J., et al.: SOFT: softmax-free transformer with linear complexity. In: NeurIPS (2021)"},{"key":"5_CR26","unstructured":"Lu, Z., et al.: FiT: Flexible vision transformer for diffusion model. In: arXiv (2024)"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ren, X., Zheng, Z., Jiang, Z., Jiang, X., You, Y.: CAME: Confidence-guided adaptive memory efficient optimization. In: ACL (2023)","DOI":"10.18653\/v1\/2023.acl-long.243"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Ma, N., Goldstein, M., Albergo, M.S., Boffi, N.M., Vanden-Eijnden, E., Xie, S.: SiT: Exploring flow and diffusion-based generative models with scalable interpolant transformers. In: arXiv (2024)","DOI":"10.1007\/978-3-031-72980-5_2"},{"key":"5_CR29","unstructured":"Midjourney: Midjourney (2023). https:\/\/www.midjourney.com"},{"key":"5_CR30","unstructured":"OpenAI: Dalle-2 (2023). https:\/\/openai.com\/dall-e-2"},{"key":"5_CR31","unstructured":"OpenAI: Dalle-3 (2023). https:\/\/openai.com\/dall-e-3"},{"key":"5_CR32","unstructured":"OpenAI: GPT-4V(ision) system card. In: OpenAI (2023)"},{"key":"5_CR33","unstructured":"OpenAI: Sora (2024). https:\/\/openai.com\/sora"},{"key":"5_CR34","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"5_CR35","unstructured":"Pernias, P., Rampas, D., Richter, M.L., Pal, C., Aubreville, M.: W\u00fcrstchen: an efficient architecture for large-scale text-to-image diffusion models. In: ICLR (2023)"},{"key":"5_CR36","unstructured":"Podell, D., et al.: SDXL: Improving latent diffusion models for high-resolution image synthesis. In: arXiv (2023)"},{"key":"5_CR37","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training. OpenAI blog (2018)"},{"key":"5_CR38","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI blog (2019)"},{"key":"5_CR39","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"5_CR40","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS (2022)"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Sauer, A., Lorenz, D., Blattmann, A., Rombach, R.: Adversarial diffusion distillation. In: arXiv (2023)","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"5_CR42","unstructured":"Stability.AI: Stable diffusion 3 (2024). https:\/\/stability.ai\/news\/stable-diffusion-3"},{"key":"5_CR43","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: ICML (2021)"},{"key":"5_CR44","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: Self-attention with linear complexity. In: arXiv (2020)"},{"key":"5_CR45","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"5_CR46","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: PVT v2: improved baselines with pyramid vision transformer. Comput. Visual Media 8(3), 415\u2013424 (2022)","DOI":"10.1007\/s41095-022-0274-8"},{"key":"5_CR47","doi-asserted-by":"crossref","unstructured":"Xia, Z., Pan, X., Song, S., Li, L.E., Huang, G.: Vision transformer with deformable attention. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"5_CR48","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. In: NeurIPS (2021)"},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Xie, E., et al.: DiffFit: unlocking transferability of large diffusion models via simple parameter-efficient fine-tuning. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00390"},{"key":"5_CR50","unstructured":"Xue, Z., et al.: RAPHAEL: text-to-image generation via large mixture of diffusion paths. In: NeurIPS (2023)"},{"key":"5_CR51","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token ViT: training vision transformers from scratch on ImageNet. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"5_CR52","unstructured":"Zheng, H., Nie, W., Vahdat, A., Anandkumar, A.: Fast training of diffusion models with masked transformers. In: arXiv (2023)"},{"key":"5_CR53","doi-asserted-by":"crossref","unstructured":"Zheng, S., et\u00a0al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"5_CR54","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: ICLR (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73411-3_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:33:56Z","timestamp":1733088836000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73411-3_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,23]]},"ISBN":["9783031734106","9783031734113"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73411-3_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,23]]},"assertion":[{"value":"23 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}