{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T23:03:11Z","timestamp":1772751791913,"version":"3.50.1"},"reference-count":85,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T00:00:00Z","timestamp":1772668800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T00:00:00Z","timestamp":1772668800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s11263-026-02795-2","type":"journal-article","created":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T10:00:03Z","timestamp":1772704803000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Lumina-mGPT: Flexible Photorealistic Autoregressive Text-to-Image Generation"],"prefix":"10.1007","volume":"134","author":[{"given":"Dongyang","family":"Liu","sequence":"first","affiliation":[]},{"given":"Yi","family":"Xin","sequence":"additional","affiliation":[]},{"given":"Shitian","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Le","family":"Zhuo","sequence":"additional","affiliation":[]},{"given":"Weifeng","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Xinyue","family":"Li","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Qin","sequence":"additional","affiliation":[]},{"given":"Guangtao","family":"Zhai","sequence":"additional","affiliation":[]},{"given":"Xiaohong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,5]]},"reference":[{"key":"2795_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et al. (2023). Gpt-4 technical report. arXiv:2303.08774."},{"key":"2795_CR2","unstructured":"Bai, J., Bai, S., Chu, Y., Cui, Z., Dang, K., Deng, X., Fan, Y., Ge, W., Han, Y., Huang, F., et al. (2023). Qwen technical report. arXiv:2309.16609."},{"key":"2795_CR3","doi-asserted-by":"crossref","unstructured":"Bansal, A., Russell, B., & Gupta, A. (2016). Marr revisited: 2d\u20133d alignment via surface normal prediction. In Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.642"},{"issue":"3","key":"2795_CR4","first-page":"8","volume":"2","author":"J Betker","year":"2023","unstructured":"Betker, J., Goh, G., Jing, L., Brooks, T., Wang, J., Li, L., Ouyang, L., Zhuang, J., Lee, J., Guo, Y., et al. (2023). Improving image generation with better captions. Computer Science, 2(3), 8.","journal-title":"Computer Science"},{"key":"2795_CR5","unstructured":"Bi, X., Chen, D., Chen, G., Chen, S., Dai, D., Deng, C., Ding, H., Dong, K., Du, Q., Fu, Z., et al. (2024). Deepseek llm: Scaling open-source language models with longtermism. 
arXiv:2401.02954."},{"key":"2795_CR6","unstructured":"Brooks, T., Peebles, B., Holmes, C., DePue, W., Guo, Y., Jing, L., Schnurr, D., Taylor, J., Luhman, T., Luhman, E., Ng, C., Wang, R., & Ramesh, A. (2024). Video generation models as world simulators. https:\/\/openai.com\/research\/video-generation-models-as-world-simulators."},{"key":"2795_CR7","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in Neural Information Processing Systems, 33, 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR8","doi-asserted-by":"crossref","unstructured":"Butler, D.J., Wulff, J., Stanley, G.B., Black, M.J. (2012). A naturalistic open source movie for optical flow evaluation. In Computer Vision-ECCV 2012: 12th european conference on computer vision, Florence, Italy, October 7\u201313, 2012, Proceedings, Part VI 12, Springer, pp. 611\u2013625.","DOI":"10.1007\/978-3-642-33783-3_44"},{"key":"2795_CR9","unstructured":"Cabon, Y., Murray, N., & Humenberger, M. (2020). Virtual kitti 2. arXiv:2001.10773."},{"key":"2795_CR10","unstructured":"Chang, H., Zhang, H., Barber, J., Maschinot, A., Lezama, J., Jiang, L., Yang, M.H., Murphy, K.P., Freeman, W.T., Rubinstein, M., et al. (2023). Muse: Text-to-image generation via masked generative transformers. In International conference on machine learning, PMLR, pp 4055\u20134075."},{"key":"2795_CR11","doi-asserted-by":"crossref","unstructured":"Chen, J., Ge, C., Xie, E., Wu, Y., Yao, L., Ren, X., Wang, Z., Luo, P., Lu, H., & Li, Z. (2024). Pixart-$$\\Sigma $$: Weak-to-strong training of diffusion transformer for 4k text-to-image generation. arXiv:2403.04692.","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"2795_CR12","unstructured":"Chen, J., Yu, J., Ge, C., Yao, L., Xie, E., Wang, Z., Kwok, J., Luo, P., Lu, H., & Li, Z. (2024). Pixart-$$\\alpha $$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. In The twelfth international conference on learning representations, https:\/\/openreview.net\/forum?id=eAKmQPe3m1."},{"key":"2795_CR13","unstructured":"Chowdhery, A., Narang, S., Devlin, J., Bosma, M., Mishra, G., Roberts, A., Barham, P., Chung, H.W., Sutton, C., Gehrmann, S., Schuh, P., Shi, K., Tsvyashchenko, S., Maynez, J., Rao, A., Barnes, P., Tay, Y., Shazeer, N., Prabhakaran, V., Reif, E., Du, N., Hutchinson, B., Pope, R., Bradbury, J., Austin, J., Isard, M., Gur-Ari, G., Yin, P., Duke, T., Levskaya, A., Ghemawat, S., Dev, S., Michalewski, H., Garcia, X., Misra, V., Robinson, K., Fedus, L., Zhou, D., Ippolito, D., Luan, D., Lim, H., Zoph, B., Spiridonov, A., Sepassi, R., Dohan, D., Agrawal, S., Omernick, M., Dai, A.M., Pillai, T.S., Pellat, M., Lewkowycz, A., Moreira, E., Child, R., Polozov, O., Lee, K., Zhou, Z., Wang, X., Saeta, B., Diaz, M., Firat, O., Catasta, M., Wei, J., Meier-Hellstern, K., Eck, D., Dean, J., Petrov, S., & Fiedel, N. (2022). PaLM: Scaling language modeling with pathways. arXiv:2204.02311."},{"key":"2795_CR14","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., & Nie\u00dfner, M. (2017). Scannet: Richly-annotated 3d reconstructions of indoor scenes. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 
5828\u20135839.","DOI":"10.1109\/CVPR.2017.261"},{"key":"2795_CR15","unstructured":"Dao, T. (2024). FlashAttention-2: Faster attention with better parallelism and work partitioning. In International conference on learning representations (ICLR)."},{"key":"2795_CR16","doi-asserted-by":"crossref","unstructured":"Dao, T., Fu, D.Y., Ermon, S., Rudra, A., & R\u00e9, C. (2022). FlashAttention: Fast and memory-efficient exact attention with IO-awareness. In Advances in neural information processing systems (NeurIPS).","DOI":"10.52202\/068431-1189"},{"key":"2795_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, IEEE, pp. 248\u2013255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2795_CR18","first-page":"19822","volume":"34","author":"M Ding","year":"2021","unstructured":"Ding, M., Yang, Z., Hong, W., Zheng, W., Zhou, C., Yin, D., Lin, J., Zou, X., Shao, Z., Yang, H., et al. (2021). Cogview: Mastering text-to-image generation via transformers. Advances in Neural Information Processing Systems, 34, 19822\u201319835.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR19","unstructured":"Dong, R., Han, C., Peng, Y., Qi, Z., Ge, Z., Yang, J., Zhao, L., Sun, J., Zhou, H., Wei, H., Kong, X., Zhang, X., Ma, K., & Yi, L. (2024). DreamLLM: Synergistic multimodal comprehension and creation. In The twelfth international conference on learning representations, https:\/\/openreview.net\/forum?id=y01KGvd9Bw."},{"key":"2795_CR20","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., & Ommer, B. (2021). Taming transformers for high-resolution image synthesis. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 12873\u201312883.","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2795_CR21","unstructured":"Esser, P., Kulal, S., Blattmann, A., Entezari, R., M\u00fcller, J., Saini, H., Levi, Y., Lorenz, D., Sauer, A., Boesel, F., et al. (2024). Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first international conference on machine learning."},{"key":"2795_CR22","unstructured":"Gao, P., Zhuo, L., Lin, Z., Liu, C., Chen, J., Du, R., Xie, E., Luo, X., Qiu, L., Zhang, Y., et al. (2024). Lumina-t2x: Transforming text into any modality, resolution, and duration via flow-based large diffusion transformers. arXiv:2405.05945."},{"key":"2795_CR23","unstructured":"Ge, Y., Zhao, S., Li, C., Ge, Y., & Shan, Y. (2024). Seed-data-edit technical report: A hybrid dataset for instructional image editing. arXiv:2405.04007."},{"key":"2795_CR24","doi-asserted-by":"crossref","unstructured":"Ghosh, D., Hajishirzi, H., & Schmidt, L. (2023). Geneval: An object-focused framework for evaluating text-to-image alignment. In: Advances in Neural Information Processing Systems, pp 52132\u201352152.","DOI":"10.52202\/075280-2270"},{"key":"2795_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Identity mappings in deep residual networks. In: Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, Springer, pp 630\u2013645.","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"2795_CR26","unstructured":"Ho, J., & Salimans, T. (2022). Classifier-free diffusion guidance. 
arXiv:2207.12598."},{"key":"2795_CR27","unstructured":"Hu, X., Wang, R., Fang, Y., Fu, B., Cheng, P., & Yu, G. (2024). Ella: Equip diffusion models with llm for enhanced semantic alignment. arXiv:2403.05135."},{"key":"2795_CR28","doi-asserted-by":"crossref","unstructured":"Jain, J., Li, J., Chiu, M.T., Hassani, A., Orlov, N., & Shi, H. (2023). Oneformer: One transformer to rule universal image segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2989\u20132998.","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"2795_CR29","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., & Berg, T. (2014). Referitgame: Referring to objects in photographs of natural scenes. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp. 787\u2013798.","DOI":"10.3115\/v1\/D14-1086"},{"key":"2795_CR30","first-page":"21487","volume":"36","author":"JY Koh","year":"2024","unstructured":"Koh, J. Y., Fried, D., & Salakhutdinov, R. R. (2024). Generating images with multimodal language models. Advances in Neural Information Processing Systems, 36, 21487.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR31","doi-asserted-by":"crossref","unstructured":"Kwon, W., Li, Z., Zhuang, S., Sheng, Y., Zheng, L., Yu, C.H., Gonzalez, J.E., Zhang, H., & Stoica, I. (2023). Efficient memory management for large language model serving with pagedattention. In Proceedings of the ACM SIGOPS 29th symposium on operating systems principles.","DOI":"10.1145\/3600006.3613165"},{"key":"2795_CR32","unstructured":"Leviathan, Y., Kalman, M., & Matias, Y. (2023). Fast inference from transformers via speculative decoding. In International conference on machine learning, PMLR, pp. 19274\u201319286."},{"key":"2795_CR33","first-page":"56424","volume":"37","author":"T Li","year":"2024","unstructured":"Li, T., Tian, Y., Li, H., Deng, M., & He, K. (2024). Autoregressive image generation without vector quantization. Advances in Neural Information Processing Systems, 37, 56424\u201356445.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR34","unstructured":"Li, Y., Zhang, Y., Wang, C., Zhong, Z., Chen, Y., Chu, R., Liu, S., & Jia, J. (2024). Mini-gemini: Mining the potential of multi-modality vision language models. arXiv:2403.18814."},{"key":"2795_CR35","unstructured":"Li, Z., Zhang, J., Lin, Q., Xiong, J., Long, Y., Deng, X., Zhang, Y., Liu, X., Huang, M., Xiao, Z., et al. (2024). Hunyuan-dit: A powerful multi-resolution diffusion transformer with fine-grained chinese understanding. arXiv:2405.08748."},{"key":"2795_CR36","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., & Yuan, L. (2023). Video-llava: Learning united visual representation by alignment before projection. arXiv:2311.10122.","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2795_CR37","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th european conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, Springer, pp 740\u2013755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2795_CR38","doi-asserted-by":"crossref","unstructured":"Lin, Z., Liu, C., Zhang, R., Gao, P., Qiu, L., Xiao, H., Qiu, H., Lin, C., Shao, W., Chen, K., et al. (2023). 
Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv:2311.07575.","DOI":"10.1007\/978-3-031-73033-7_3"},{"key":"2795_CR39","unstructured":"Liu, D., Zhao, S., Zhuo, L., Lin, W., Xin, Y., Li, X., Qin, Q., Qiao, Y., Li, H., & Gao, P. (2024). Lumina-mgpt: Illuminate flexible photorealistic text-to-image generation with multimodal generative pretraining. arXiv:2408.02657."},{"key":"2795_CR40","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2023). Visual instruction tuning. arXiv:2304.08485."},{"key":"2795_CR41","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y.J. (2024). Improved baselines with visual instruction tuning. In Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp. 26296\u201326306.","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2795_CR42","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. arXiv:1711.05101."},{"key":"2795_CR43","unstructured":"Lu, J., Batra, D., Parikh, D., & Lee, S. (2019). Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32."},{"key":"2795_CR44","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., & Kembhavi, A. (2022). Unified-io: A unified model for vision, language, and multi-modal tasks. In: The eleventh international conference on learning representations."},{"key":"2795_CR45","doi-asserted-by":"crossref","unstructured":"Lu, J., Clark, C., Lee, S., Zhang, Z., Khosla, S., Marten, R., Hoiem, D., & Kembhavi, A. (2024). Unified-io 2: Scaling autoregressive multimodal models with vision language audio and action. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 26439\u201326455.","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"2795_CR46","unstructured":"Lu, Z., Wang, Z., Huang, D., Wu, C., Liu, X., Ouyang, W., & Bai, L. (2024). Fit: Flexible vision transformer for diffusion model. In Forty-first international conference on machine learning, https:\/\/openreview.net\/forum?id=jZVen2JguY."},{"key":"2795_CR47","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F.S. (2023). Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv:2306.05424.","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2795_CR48","doi-asserted-by":"crossref","unstructured":"Mou, C., Wang, X., Xie, L., Wu, Y., Zhang, J., Qi, Z., & Shan, Y. (2024). T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. In: Proceedings of the AAAI conference on artificial intelligence, pp. 4296\u20134304.","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"2795_CR49","unstructured":"Pernias, P., Rampas, D., Richter, M.L., Pal, C., & Aubreville, M. (2024). W\u00fcrstchen: An efficient architecture for large-scale text-to-image diffusion models. In The twelfth international conference on learning representations, https:\/\/openreview.net\/forum?id=gU58d5QeGv."},{"key":"2795_CR50","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., & Rombach, R. (2023). Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv:2307.01952."},{"key":"2795_CR51","doi-asserted-by":"crossref","unstructured":"Qi, X., Liao, R., Liu, Z., Urtasun, R., & Jia, J. (2018). 
Geonet: Geometric neural network for joint depth and surface normal estimation. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 283\u2013291.","DOI":"10.1109\/CVPR.2018.00037"},{"issue":"2","key":"2795_CR52","doi-asserted-by":"publisher","first-page":"969","DOI":"10.1109\/TPAMI.2020.3020800","volume":"44","author":"X Qi","year":"2020","unstructured":"Qi, X., Liu, Z., Liao, R., Torr, P. H., Urtasun, R., & Jia, J. (2020). Geonet++: Iterative geometric neural network with edge-aware refinement for joint depth and surface normal estimation. IEEE Transactions on Pattern Analysis and Machine Intelligence, 44(2), 969\u2013984.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2795_CR53","unstructured":"Qin, C., Zhang, S., Yu, N., Feng, Y., Yang, X., Zhou, Y., Wang, H., Niebles, J.C., Xiong, C., Savarese, S., et al. (2023). Unicontrol: A unified diffusion model for controllable visual generation in the wild. arXiv:2305.11147."},{"key":"2795_CR54","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al. (2018). Improving language understanding by generative pre-training."},{"key":"2795_CR55","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, PMLR, pp. 8748\u20138763."},{"issue":"140","key":"2795_CR56","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, 21(140), 1\u201367.","journal-title":"Journal of machine learning research"},{"key":"2795_CR57","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In International conference on machine learning, Pmlr, pp. 8821\u20138831."},{"key":"2795_CR58","unstructured":"Razavi, A., Van den Oord, A., & Vinyals, O. (2019). Generating diverse high-fidelity images with vq-vae-2. Advances in neural information processing systems 32."},{"key":"2795_CR59","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2795_CR60","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E. L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, 35, 36479\u201336494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR61","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., et al. (2022). Laion-5b: An open large-scale dataset for training next generation image-text models. 
Advances in Neural Information Processing Systems, 35, 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR62","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., & Fergus, R. (2012). Indoor segmentation and support inference from rgbd images. In Computer Vision-ECCV 2012: 12th european conference on computer vision, Florence, Italy, October 7\u201313, 2012, Proceedings, Part V 12, Springer, pp 746\u2013760.","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"2795_CR63","unstructured":"Sun, P., Jiang, Y., Chen, S., Zhang, S., Peng, B., Luo, P., & Yuan, Z. (2024). Autoregressive model beats diffusion: Llama for scalable image generation. arXiv:2406.06525."},{"key":"2795_CR64","unstructured":"Sun, Q., Yu, Q., Cui, Y., Zhang, F., Zhang, X., Wang, Y., Gao, H., Liu, J., Huang, T., & Wang, X. (2023). Emu: Generative pretraining in multimodality. In The twelfth international conference on learning representations."},{"key":"2795_CR65","unstructured":"Team, C. (2024). Chameleon: Mixed-modal early-fusion foundation models. arXiv:2405.09818."},{"key":"2795_CR66","unstructured":"Team, G., Anil, R., Borgeaud, S., Wu, Y., Alayrac JB, Yu, J., Soricut, R., Schalkwyk, J., Dai, A.M., Hauth, A., et al. (2023). Gemini: a family of highly capable multimodal models. arXiv:2312.11805."},{"key":"2795_CR67","unstructured":"Team, K. (2024). Kolors: Effective training of diffusion model for photorealistic text-to-image synthesis. arXiv preprint."},{"key":"2795_CR68","unstructured":"Teknium (2023). Openhermes 2.5: An open dataset of synthetic data for generalist llm assistants. https:\/\/huggingface.co\/datasets\/teknium\/OpenHermes-2.5."},{"key":"2795_CR69","first-page":"84839","volume":"37","author":"K Tian","year":"2024","unstructured":"Tian, K., Jiang, Y., Yuan, Z., Peng, B., & Wang, L. (2024). Visual autoregressive modeling: Scalable image generation via next-scale prediction. Advances in Neural Information Processing Systems, 37, 84839\u201384865.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR70","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et al. (2023). Llama: Open and efficient foundation language models. arXiv:2302.13971."},{"key":"2795_CR71","unstructured":"Van Den Oord, A., Vinyals, O., et al. (2017). Neural discrete representation learning. Advances in Neural Information Processing Systems 30."},{"key":"2795_CR72","unstructured":"Wang, P., Shen, X., Russell, B., Cohen, S., Price, B., & Yuille, A.L. (2016). Surge: Surface regularized geometry estimation from a single image. Advances in Neural Information Processing Systems 29."},{"key":"2795_CR73","unstructured":"Wang, P., Yang, A., Men, R., Lin, J., Bai, S., Li, Z., Ma, J., Zhou, C., Zhou, J., & Yang, H. (2022). Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International conference on machine learning, PMLR, pp. 23318\u201323340."},{"key":"2795_CR74","doi-asserted-by":"crossref","unstructured":"Wang, X., Fouhey, D., & Gupta, A. (2015). Designing deep networks for surface normal estimation. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 539\u2013547.","DOI":"10.1109\/CVPR.2015.7298652"},{"key":"2795_CR75","unstructured":"Wang, X., Zhang, X., Luo, Z., Sun, Q., Cui, Y., Wang, J., Zhang, F., Wang, Y., Li, Z., Yu, Q., et al. (2024). 
Emu3: Next-token prediction is all you need. arXiv:2409.18869."},{"key":"2795_CR76","doi-asserted-by":"crossref","unstructured":"Wu, C., Chen, X., Wu, Z., Ma, Y., Liu, X., Pan, Z., Liu, W., Xie, Z., Yu, X., Ruan, C., et al. (2025). Janus: Decoupling visual encoding for unified multimodal understanding and generation. In Proceedings of the computer vision and pattern recognition conference, pp. 12966\u201312977.","DOI":"10.1109\/CVPR52734.2025.01210"},{"key":"2795_CR77","unstructured":"Xu, S., Ma, Z., Chai, W., Chen, X., Jin, W., Chai, J., Xie, S., & Yu, S.X. (2025). Next-embedding prediction makes strong vision learners. arXiv:2512.16922."},{"key":"2795_CR78","unstructured":"Yu, J., Xu, Y., Koh, J.Y., Luong, T., Baid, G., Wang, Z., Vasudevan, V., Ku, A., Yang, Y., Ayan, B.K., et al. (2022). Scaling autoregressive models for content-rich text-to-image generation. Transactions on Machine Learning Research."},{"key":"2795_CR79","unstructured":"Yu, L., Shi, B., Pasunuru, R., Muller, B., Golovneva, O., Wang, T., Babu, A., Tang, B., Karrer, B., Sheynin, S., et al. (2023). Scaling autoregressive multi-modal models: Pretraining and instruction tuning. arXiv:2309.02591."},{"key":"2795_CR80","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., & Beyer, L. (2023). Sigmoid loss for language image pre-training. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 11975\u201311986.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"2795_CR81","first-page":"31428","volume":"36","author":"K Zhang","year":"2024","unstructured":"Zhang, K., Mo, L., Chen, W., Sun, H., & Su, Y. (2024). Magicbrush: A manually annotated dataset for instruction-guided image editing. Advances in Neural Information Processing Systems, 36, 31428.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2795_CR82","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., & Agrawala, M. (2023). Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2795_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Cui, Z., Xu, C., Yan, Y., Sebe, N., & Yang, J. (2019). Pattern-affinitive propagation across depth, surface normal and semantic segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00423"},{"key":"2795_CR84","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Gu, A., Varma, R., Luo, L., Huang, C.C., Xu, M., Wright, L., Shojanazeri, H., Ott, M., Shleifer, S., et al. (2023). Pytorch fsdp: experiences on scaling fully sharded data parallel. arXiv:2304.11277.","DOI":"10.14778\/3611540.3611569"},{"key":"2795_CR85","unstructured":"Zhuo, L., Du, R., Xiao, H., Li, Y., Liu, D., Huang, R., Liu, W., Zhao, L., Wang, F.Y., Ma, Z., et\u00a0al. (2024). Lumina-next: Making lumina-t2x stronger and faster with next-dit. 
arXiv:2406.18583."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02795-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02795-2","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02795-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T22:04:04Z","timestamp":1772748244000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02795-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,5]]},"references-count":85,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["2795"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02795-2","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,5]]},"assertion":[{"value":"27 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"141"}}