{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:19:33Z","timestamp":1778084373687,"version":"3.51.4"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730092","type":"print"},{"value":"9783031730108","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73010-8_9","type":"book-chapter","created":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T13:10:53Z","timestamp":1731157853000},"page":"141-156","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Inf-DiT: Upsampling Any-Resolution Image with\u00a0Memory-Efficient Diffusion Transformer"],"prefix":"10.1007","author":[{"given":"Zhuoyi","family":"Yang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Heyang","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenyi","family":"Hong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiayan","family":"Teng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wendi","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxiao","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,10]]},"reference":[{"key":"9_CR1","doi-asserted-by":"crossref","unstructured":"Agustsson, E., Timofte, R.: NTIRE 2017 challenge on single image super-resolution: dataset and study. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 126\u2013135 (2017)","DOI":"10.1109\/CVPRW.2017.150"},{"key":"9_CR2","unstructured":"Bar-Tal, O., Yariv, L., Lipman, Y., Dekel, T.: MultiDiffusion: fusing diffusion paths for controlled image generation (2023)"},{"key":"9_CR3","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"170","DOI":"10.1007\/978-3-031-19787-1_10","volume-title":"ECCV 2022","author":"L Chai","year":"2022","unstructured":"Chai, L., Gharbi, M., Shechtman, E., Isola, P., Zhang, R.: Any-resolution training for high-resolution image synthesis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13676, pp. 170\u2013188. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19787-1_10"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Dai, Z., Yang, Z., Yang, Y., Carbonell, J., Le, Q.V., Salakhutdinov, R.: Transformer-XL: attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860 (2019)","DOI":"10.18653\/v1\/P19-1285"},{"key":"9_CR5","unstructured":"Dehghani, M., et\u00a0al.: Scaling vision transformers to 22 billion parameters. In: International Conference on Machine Learning, pp. 7480\u20137512. PMLR (2023)"},{"key":"9_CR6","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16 \\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Du, R., Chang, D., Hospedales, T., Song, Y.Z., Ma, Z.: DemoFusion: democratising high-resolution image generation with no $$\\$. arXiv preprint arXiv:2311.16973 (2023)","DOI":"10.1109\/CVPR52733.2024.00589"},{"key":"9_CR8","unstructured":"Guttenberg, N.: Diffusion with offset noise (2023). https:\/\/www.crosslabs.org\/blog\/diffusion-with-offset-noise"},{"key":"9_CR9","unstructured":"He, Y., et al.: Scalecrafter: tuning-free higher-resolution visual generation with diffusion models. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"9_CR10","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"9_CR11","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"issue":"1","key":"9_CR12","first-page":"2249","volume":"23","author":"J Ho","year":"2022","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. J. Mach. Learn. Res. 23(1), 2249\u20132281 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"9_CR13","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773, if you use this software, please cite it as below","DOI":"10.5281\/zenodo.5143773"},{"key":"9_CR14","unstructured":"Jim\u00e9nez, \u00c1.B.: Mixture of diffusers for scene composition and high resolution image generation. arXiv preprint arXiv:2302.02412 (2023)"},{"key":"9_CR15","unstructured":"Karras, T., Aittala, M., Aila, T., Laine, S.: Elucidating the design space of diffusion-based generative models. In: Advances in Neural Information Processing Systems, vol. 35, pp. 26565\u201326577 (2022)"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Lin, X., et al.: DiffBIR: towards blind image restoration with generative diffusion prior. arXiv preprint arXiv:2308.15070 (2023)","DOI":"10.1007\/978-3-031-73202-7_25"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"9_CR18","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"9_CR19","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR20","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, 1(2), 3 (2022)"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"9_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"9_CR23","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 36479\u201336494 (2022)"},{"issue":"4","key":"9_CR24","first-page":"4713","volume":"45","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Ho, J., Chan, W., Salimans, T., Fleet, D.J., Norouzi, M.: Image super-resolution via iterative refinement. IEEE Trans. Pattern Anal. Mach. Intell. 45(4), 4713\u20134726 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR25","unstructured":"Schuhmann, C., et al.: LAION-5b: an open large-scale dataset for training next generation image-text models. In: Thirty-Sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2022). https:\/\/openreview.net\/forum?id=M3Y74vmsMcY"},{"key":"9_CR26","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"9_CR27","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"9_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063","volume":"568","author":"J Su","year":"2024","unstructured":"Su, J., Ahmed, M., Lu, Y., Pan, S., Bo, W., Liu, Y.: Roformer: enhanced transformer with rotary position embedding. Neurocomputing 568, 127063 (2024)","journal-title":"Neurocomputing"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Wang, J., Yue, Z., Zhou, S., Chan, K.C., Loy, C.C.: Exploiting diffusion prior for real-world image super-resolution. arXiv preprint arXiv:2305.07015 (2023)","DOI":"10.1007\/s11263-024-02168-7"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Wang, X., Xie, L., Dong, C., Shan, Y.: Real-ESRGAN: training real-world blind super-resolution with pure synthetic data. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1905\u20131914 (2021)","DOI":"10.1109\/ICCVW54120.2021.00217"},{"key":"9_CR31","unstructured":"Wu, X., et al.: Human preference score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Yang, T., Ren, P., Xie, X., Zhang, L.: Pixel-aware stable diffusion for realistic image super-resolution and personalized stylization. arXiv preprint arXiv:2308.14469 (2023)","DOI":"10.1007\/978-3-031-73247-8_5"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, K., Liang, J., Van\u00a0Gool, L., Timofte, R.: Designing a practical degradation model for deep blind image super-resolution. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4791\u20134800 (2021)","DOI":"10.1109\/ICCV48922.2021.00475"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73010-8_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T02:25:00Z","timestamp":1733019900000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73010-8_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,10]]},"ISBN":["9783031730092","9783031730108"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73010-8_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,10]]},"assertion":[{"value":"10 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}