{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T16:18:03Z","timestamp":1771604283675,"version":"3.50.1"},"reference-count":84,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s11263-025-02639-5","type":"journal-article","created":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T03:43:51Z","timestamp":1767757431000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Liquid: Language Models are Scalable and Unified Multi-Modal Generators"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3453-7140","authenticated-orcid":false,"given":"Junfeng","family":"Wu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2133-8719","authenticated-orcid":false,"given":"Yi","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2219-2310","authenticated-orcid":false,"given":"Chuofan","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3037-173X","authenticated-orcid":false,"given":"Yuliang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8277-2706","authenticated-orcid":false,"given":"Hengshuang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zehuan","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"Bai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3449-5940","authenticated-orcid":false,"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,7]]},"reference":[{"key":"2639_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J. B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et al. (2022). Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2639_CR2","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023) Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966"},{"key":"2639_CR3","doi-asserted-by":"crossref","unstructured":"Bisk, Y., Zellers, R., Gao, J., & Choi, Y., et\u00a0al. (2020) Piqa: Reasoning about physical commonsense in natural language. In: Proceedings of the AAAI conference on artificial intelligence. pp. 7432\u20137439","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"2639_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in neural information processing systems, 33, 1877\u20131901.","journal-title":"Advances in neural information processing systems"},{"key":"2639_CR5","unstructured":"Chen, G.H., Chen, S., Zhang, R., Chen, J., Wu, X., Zhang, Z., Chen, Z., Li, J., Wan, X., & Wang, B. (2024) Allava: Harnessing gpt4v-synthesized data for a lite vision-language model. arXiv preprint arXiv:2402.11684"},{"key":"2639_CR6","doi-asserted-by":"crossref","unstructured":"Chen, J., Yu, J., Ge, C., Yao, L., Xie, E., Wu, Y., Wang, Z., Kwok, J., Luo, P., & Lu, H., et\u00a0al. (2023) Pixart-$$alpha$$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"2639_CR7","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, J., Dong, X., Zhang, P., He, C., Wang, J., Zhao, F., & Lin, D. (2023) Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"2639_CR8","unstructured":"Chowdhery, A., Narang, S., Devlin, J., Bosma, M., Mishra, G., Roberts, A., Barham, P., Chung, H.W., Sutton, C., & Gehrmann, S., et\u00a0al. (2022) Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311"},{"key":"2639_CR9","unstructured":"Clark, C., Lee, K., Chang, M.W., Kwiatkowski, T., Collins, M., & Toutanova, K. (2019) Boolq: Exploring the surprising difficulty of natural yes\/no questions. arXiv preprint arXiv:1905.10044"},{"key":"2639_CR10","unstructured":"Clark, P., Cowhey, I., Etzioni, O., Khot, T., Sabharwal, A., Schoenick, C., & Tafjord, O. (2018) Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457"},{"key":"2639_CR11","unstructured":"Dai, W., Li, J., Li, D., Tiong, A.M.H., Zhao, J., Wang, W., Li, B., Fung, P., & Hoi, S. (2023) Instructblip: Towards general-purpose vision-language models with instruction tuning. arXiv"},{"key":"2639_CR12","first-page":"19822","volume":"34","author":"M Ding","year":"2021","unstructured":"Ding, M., Yang, Z., Hong, W., Zheng, W., Zhou, C., Yin, D., Lin, J., Zou, X., Shao, Z., Yang, H., et al. (2021). Cogview: Mastering text-to-image generation via transformers. Advances in neural information processing systems, 34, 19822\u201319835.","journal-title":"Advances in neural information processing systems"},{"key":"2639_CR13","unstructured":"Dong, R., Han, C., Peng, Y., Qi, Z., Ge, Z., Yang, J., Zhao, L., Sun, J., Zhou, H., & Wei, H., et\u00a0al. (2023) Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499"},{"key":"2639_CR14","unstructured":"Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Yang, A., & Fan, A., et\u00a0al. (2024) The llama 3 herd of models. arXiv preprint arXiv:2407.21783"},{"key":"2639_CR15","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., & Ommer, B.(2021) Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 12873\u201312883","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2639_CR16","unstructured":"Fu, C., Chen, P., Shen, Y., Qin, Y., Zhang, M., Lin, X., Yang, J., Zheng, X., Li, K., Sun, X., Wu, Y., & Ji, R. (2024) Mme: A comprehensive evaluation benchmark for multimodal large language models"},{"key":"2639_CR17","unstructured":"Ge, Y., Ge, Y., Zeng, Z., Wang, X., & Shan, Y. (2023) Planting a seed of vision in large language model. arXiv preprint arXiv:2307.08041"},{"key":"2639_CR18","unstructured":"Ge, Y., Zhao, S., Zhu, J., Ge, Y., Yi, K., Song, L., Li, C., Ding, X., & Shan, Y. (2024) Seed-x: Multimodal models with unified multi-granularity comprehension and generation. arXiv preprint arXiv:2404.14396"},{"key":"2639_CR19","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., & Parikh, D. (2017) Making the V in VQA matter: Elevating the role of image understanding in Visual Question Answering. In: Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2017.670"},{"key":"2639_CR20","unstructured":"Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D.X., & Steinhardt, J. (2020) Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300"},{"key":"2639_CR21","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30"},{"key":"2639_CR22","unstructured":"Ho, J., & Salimans, T. (2022) Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598"},{"key":"2639_CR23","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., & Manning, C.D. (2019) Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 6700\u20136709","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2639_CR24","unstructured":"Hui, B., Yang, J., Cui, Z., Yang, J., Liu, D., Zhang, L., Liu, T., Zhang, J., Yu, B., & Lu, K., et\u00a0al. (2024) Qwen2. 5-coder technical report. arXiv preprint arXiv:2409.12186"},{"key":"2639_CR25","unstructured":"Jin, Y., Xu, K., Chen, L., Liao, C., Tan, J., Chen, B., Lei, C., Liu, A., Song, C., & Lei, X., et\u00a0al. (2023) Unified language-vision pretraining with dynamic discrete visual tokenization. arXiv preprint arXiv:2309.04669"},{"key":"2639_CR26","unstructured":"Lauren\u00e7on, H., van Strien, D., Bekman, S., Tronchon, L., Saulnier, L., Wang, T., Karamcheti, S., Singh, A., Pistilli, G., & Jernite, Y., et\u00a0al. (2023) Introducing idefics: An open reproduction of state-of-the-art visual language model, 2023. URL https:\/\/huggingface co\/blog\/idefics. Accessed pp. 09\u201318"},{"key":"2639_CR27","unstructured":"Li, D., Kamko, A., Akhgari, E., Sabet, A., Xu, L., & Doshi, S. (2024) Playground v2. 5: Three insights towards enhancing aesthetic quality in text-to-image generation. arXiv preprint arXiv:2402.17245"},{"key":"2639_CR28","doi-asserted-by":"crossref","unstructured":"Li, H., Tian, C., Shao, J., Zhu, X., Wang, Z., Zhu, J., Dou, W., Wang, X., Li, H., & Lu, L., et\u00a0al. (2024) Synergen-vl: Towards synergistic image understanding and generation with vision experts and token folding. arXiv preprint arXiv:2412.09604","DOI":"10.1109\/CVPR52734.2025.02771"},{"key":"2639_CR29","unstructured":"Li, J., Fang, A., Smyrnis, G., Ivgi, M., Jordan, M., Gadre, S., Bansal, H., Guha, E., Keh, S., Arora, K., Garg, S., Xin, R., Muennighoff, N., Heckel, R., Mercat, J., Chen, M., Gururangan, S., Wortsman, M., Albalak, A., Bitton, Y., Nezhurina, M., Abbas, A., Hsieh, C.Y., Ghosh, D., Gardner, J., Kilian, M., Zhang, H., Shao, R., Pratt, S., Sanyal, S., Ilharco, G., Daras, G., Marathe, K., Gokaslan, A., Zhang, J., Chandu, K., Nguyen, T., Vasiljevic, I., Kakade, S., Song, S., Sanghavi, S., Faghri, F., Oh, S., Zettlemoyer, L., Lo, K., El-Nouby, A., Pouransari, H., Toshev, A., Wang, S., Groeneveld, D., Soldaini, L., Koh, P.W., Jitsev, J., Kollar, T., Dimakis, A.G., Carmon, Y., Dave, A., Schmidt, L., & Shankar, V. (2024) Datacomp-lm: In search of the next generation of training sets for language models. arXiv preprint arXiv:2406.11794"},{"key":"2639_CR30","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023) Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International conference on machine learning. pp. 19730\u201319742. PMLR"},{"key":"2639_CR31","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022) Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International conference on machine learning. pp. 12888\u201312900. PMLR"},{"key":"2639_CR32","unstructured":"Li, R., Allal, L.B., Zi, Y., Muennighoff, N., Kocetkov, D., Mou, C., Marone, M., Akiki, C., Li, J., Chim, J., Liu, Q., Zheltonozhskii, E., Zhuo, T.Y., Wang, T., Dehaene, O., Davaadorj, M., Lamy-Poirier, J., Monteiro, J., Shliazhko, O., Gontier, N., Meade, N., Zebaze, A., Yee, M.H., Umapathi, L.K., Zhu, J., Lipkin, B., Oblokulov, M., Wang, Z., Murthy, R., Stillerman, J., Patel, S.S., Abulkhanov, D., Zocca, M., Dey, M., Zhang, Z., Fahmy, N., Bhattacharyya, U., Yu, W., Singh, S., Luccioni, S., Villegas, P., Kunakov, M., Zhdanov, F., Romero, M., Lee, T., Timor, N., Ding, J., Schlesinger, C., Schoelkopf, H., Ebert, J., Dao, T., Mishra, M., Gu, A., Robinson, J., Anderson, C.J., Dolan-Gavitt, B., Contractor, D., Reddy, S., Fried, D., Bahdanau, D., Jernite, Y., Ferrandis, C.M., Hughes, S., Wolf, T., Guha, A., von Werra, L., & de\u00a0Vries, H. (2023) Starcoder: may the source be with you! arXiv preprint arXiv:2305.06161"},{"key":"2639_CR33","unstructured":"Li, Y., Zhang, Y., Wang, C., Zhong, Z., Chen, Y., Chu, R., Liu, S., & Jia, J.(2024) Mini-gemini: Mining the potential of multi-modality vision language models. arXiv preprint arXiv:2403.18814"},{"key":"2639_CR34","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., & Wen, J.R. (2023) Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"2639_CR35","doi-asserted-by":"crossref","unstructured":"Lin, J., Yin, H., Ping, W., Lu, Y., Molchanov, P., Tao, A., Mao, H., Kautz, J., Shoeybi, M., & Han, S.(2023) Vila: On pre-training for visual language models","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"2639_CR36","doi-asserted-by":"crossref","unstructured":"Lin, Z., Pathak, D., Li, B., Li, J., Xia, X., Neubig, G., Zhang, P., & Ramanan, D. (2024) Evaluating text-to-visual generation with image-to-text generation. arXiv preprint arXiv:2404.01291","DOI":"10.1007\/978-3-031-72673-6_20"},{"key":"2639_CR37","doi-asserted-by":"crossref","unstructured":"Lin, Z., Pathak, D., Li, B., Li, J., Xia, X., Neubig, G., Zhang, P., & Ramanan, D. (2024) Evaluating text-to-visual generation with image-to-text generation. arXiv preprint arXiv:2404.01291","DOI":"10.1007\/978-3-031-72673-6_20"},{"key":"2639_CR38","unstructured":"Liu, H., Yan, W., Zaharia, M., & Abbeel, P. (2024) World model on million-length video and language with ringattention. arXiv preprint arXiv:2402.08268"},{"key":"2639_CR39","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y.J. (2024) Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 26296\u201326306","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2639_CR40","unstructured":"Liu, H., Li, C., Li, Y., Li, B., Zhang, Y., Shen, S., & Lee, Y.J. (2024) Llava-next: Improved reasoning, ocr, and world knowledge, https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"2639_CR41","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2024) Visual instruction tuning. Advances in neural information processing systems 36"},{"key":"2639_CR42","doi-asserted-by":"crossref","unstructured":"Lu, J., Clark, C., Lee, S., Zhang, Z., Khosla, S., Marten, R., Hoiem, D., & Kembhavi, A. (2024) Unified-io 2: Scaling autoregressive multimodal models with vision language audio and action. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 26439\u201326455","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"2639_CR43","unstructured":"Lu, Y., Li, C., Liu, H., Yang, J., Gao, J., & Shen, Y. (2023) An empirical study of scaling instruct-tuned large multimodal models. arXiv preprint arXiv:2309.09958"},{"key":"2639_CR44","unstructured":"Ma, C., Jiang, Y., Wu, J., Yang, J., Yu, X., Yuan, Z., Peng, B., & Qi, X. (2025) Unitok: A unified tokenizer for visual generation and understanding. arXiv preprint arXiv:2502.20321"},{"key":"2639_CR45","doi-asserted-by":"crossref","unstructured":"Mihaylov, T., Clark, P., Khot, T., & Sabharwal, A. (2018) Can a suit of armor conduct electricity? a new dataset for open book question answering. arXiv preprint arXiv:1809.02789","DOI":"10.18653\/v1\/D18-1260"},{"key":"2639_CR46","unstructured":"Niu, Y., Ning, M., Zheng, M., Lin, B., Jin, P., Liao, J., Ning, K., Zhu, B., & Yuan, L. (2025) Wise: A world knowledge-informed semantic evaluation for text-to-image generation. arXiv preprint arXiv:2503.07265"},{"key":"2639_CR47","unstructured":"Pan, J., Sun, K., Ge, Y., Li, H., Duan, H., Wu, X., Zhang, R., Zhou, A., Qin, Z., Wang, Y., Dai, J., Qiao, Y., & Li, H.(2023) Journeydb: A benchmark for generative image understanding"},{"key":"2639_CR48","unstructured":"Pan, K., Tang, S., Li, J., Fan, Z., Chow, W., Yan, S., Chua, T.S., Zhuang, Y., & Zhang, H. (2024) Auto-encoding morph-tokens for multimodal llm. arXiv preprint arXiv:2405.01926"},{"key":"2639_CR49","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., & Rombach, R. (2023) Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952"},{"key":"2639_CR50","doi-asserted-by":"crossref","unstructured":"Qu, L., Zhang, H., Liu, Y., Wang, X., Jiang, Y., Gao, Y., Ye, H., Du, D.K., Yuan, Z., & Wu, X. (2024) Tokenflow: Unified image tokenizer for multimodal understanding and generation. arXiv preprint arXiv:2412.03069","DOI":"10.1109\/CVPR52734.2025.00243"},{"key":"2639_CR51","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., & Clark, J., et\u00a0al. (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR"},{"issue":"1","key":"2639_CR52","first-page":"94","volume":"11","author":"AM Radhakrishnan","year":"2023","unstructured":"Radhakrishnan, A. M. (2023). Is midjourney-ai the new anti-hero of architectural imagery & creativity? GSJ, 11(1), 94\u2013104.","journal-title":"GSJ"},{"issue":"1","key":"2639_CR53","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research, 21(1), 5485\u20135551.","journal-title":"The Journal of Machine Learning Research"},{"key":"2639_CR54","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022) Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3"},{"key":"2639_CR55","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021) Zero-shot text-to-image generation. In: International Conference on Machine Learning. pp. 8821\u20138831. PMLR"},{"key":"2639_CR56","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022) High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"issue":"9","key":"2639_CR57","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3474381","volume":"64","author":"K Sakaguchi","year":"2021","unstructured":"Sakaguchi, K., Bras, R. L., Bhagavatula, C., & Choi, Y. (2021). Winogrande: An adversarial winograd schema challenge at scale. Communications of the ACM, 64(9), 99\u2013106.","journal-title":"Communications of the ACM"},{"key":"2639_CR58","doi-asserted-by":"crossref","unstructured":"Sap, M., Rashkin, H., Chen, D., LeBras, R., & Choi, Y. (2019) Socialiqa: Commonsense reasoning about social interactions. arXiv preprint arXiv:1904.09728","DOI":"10.18653\/v1\/D19-1454"},{"key":"2639_CR59","doi-asserted-by":"publisher","unstructured":"Sennrich, R., Haddow, B., & Birch, A. (2016) Neural machine translation of rare words with subword units. In: Erk, K., Smith, N.A. (eds.) Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). pp. 1715\u20131725. Association for Computational Linguistics, Berlin, Germany. https:\/\/doi.org\/10.18653\/v1\/P16-1162","DOI":"10.18653\/v1\/P16-1162"},{"key":"2639_CR60","doi-asserted-by":"crossref","unstructured":"Singh, A., Natarajan, V., Shah, M., Jiang, Y., Chen, X., Batra, D., Parikh, D., & Rohrbach, M. (2019) Towards vqa models that can read. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 8317\u20138326","DOI":"10.1109\/CVPR.2019.00851"},{"key":"2639_CR61","unstructured":"Soboleva, D., Al-Khateeb, F., Myers, R., Steeves, J.R., Hestness, J., & Dey, N. (2023) SlimPajama: A 627B token cleaned and deduplicated version of RedPajama. https:\/\/www.cerebras.net\/blog\/slimpajama-a-627b-token-cleaned-and-deduplicated-version-of-redpajama, https:\/\/huggingface.co\/datasets\/cerebras\/SlimPajama-627B"},{"key":"2639_CR62","unstructured":"Sun, P., Jiang, Y., Chen, S., Zhang, S., Peng, B., Luo, P., & Yuan, Z.(2024) Autoregressive model beats diffusion: Llama for scalable image generation. arXiv preprint arXiv:2406.06525"},{"key":"2639_CR63","doi-asserted-by":"crossref","unstructured":"Sun, Q., Cui, Y., Zhang, X., Zhang, F., Yu, Q., Wang, Y., Rao, Y., Liu, J., Huang, T., & Wang, X.(2024) Generative multimodal models are in-context learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14398\u201314409","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"2639_CR64","unstructured":"Sun, Q., Yu, Q., Cui, Y., Zhang, F., Zhang, X., Wang, Y., Gao, H., Liu, J., Huang, T., & Wang, X.(2023) Generative pretraining in multimodality. arXiv preprint arXiv:2307.05222"},{"key":"2639_CR65","unstructured":"Team, C. (2024) Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818"},{"key":"2639_CR66","unstructured":"Team, G., Mesnard, T., Hardin, C., Dadashi, R., Bhupatiraju, S., Pathak, S., Sifre, L., Rivi\u00e8re, M., Kale, M.S., & Love, J., et\u00a0al. (2024) Gemma: Open models based on gemini research and technology. arXiv preprint arXiv:2403.08295"},{"key":"2639_CR67","unstructured":"Team, G., Riviere, M., Pathak, S., Sessa, P.G., Hardin, C., Bhupatiraju, S., Hussenot, L., Mesnard, T., Shahriari, B., & Ram\u00e9, A., et\u00a0al. (2024) Gemma 2: Improving open language models at a practical size. arXiv preprint arXiv:2408.00118"},{"key":"2639_CR68","doi-asserted-by":"crossref","unstructured":"Tian, K., Jiang, Y., Yuan, Z., Peng, B., & Wang, L. (2024) Visual autoregressive modeling: Scalable image generation via next-scale prediction. arXiv preprint arXiv:2404.02905","DOI":"10.52202\/079017-2694"},{"key":"2639_CR69","unstructured":"Tong, S., Fan, D., Zhu, J., Xiong, Y., Chen, X., Sinha, K., Rabbat, M., LeCun, Y., Xie, S., & Liu, Z. (2024) Metamorph: Multimodal understanding and generation via instruction tuning. arXiv preprint arXiv:2412.14164"},{"key":"2639_CR70","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., & Azhar, F., et\u00a0al. (2023) Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2639_CR71","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., & Bhosale, S., et\u00a0al. (2023) Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288"},{"key":"2639_CR72","unstructured":"Van Den Oord, A., & Vinyals, O., et\u00a0al. (2017) Neural discrete representation learning. Advances in neural information processing systems 30"},{"key":"2639_CR73","unstructured":"Wang, X., Zhang, X., Luo, Z., Sun, Q., Cui, Y., Wang, J., Zhang, F., Wang, Y., Li, Z., & Yu, Q., et\u00a0al. (2024) Emu3: Next-token prediction is all you need. arXiv preprint arXiv:2409.18869"},{"key":"2639_CR74","unstructured":"Wang, Y., Xiong, T., Zhou, D., Lin, Z., Zhao, Y., Kang, B., Feng, J., & Liu, X. (2024) Loong: Generating minute-level long videos with autoregressive language models. arXiv preprint arXiv:2410.02757"},{"key":"2639_CR75","doi-asserted-by":"crossref","unstructured":"Wu, C., Chen, X., Wu, Z., Ma, Y., Liu, X., Pan, Z., Liu, W., Xie, Z., Yu, X., & Ruan, C., et\u00a0al.(2024) Janus: Decoupling visual encoding for unified multimodal understanding and generation. arXiv preprint arXiv:2410.13848","DOI":"10.1109\/CVPR52734.2025.01210"},{"key":"2639_CR76","unstructured":"Wu, Y., Zhang, Z., Chen, J., Tang, H., Li, D., Fang, Y., Zhu, L., Xie, E., Yin, H., & Yi, L., et\u00a0al. (2024) Vila-u: a unified foundation model integrating visual understanding and generation. arXiv preprint arXiv:2409.04429"},{"key":"2639_CR77","unstructured":"Xie, J., Mao, W., Bai, Z., Zhang, D.J., Wang, W., Lin, K.Q., Gu, Y., Chen, Z., Yang, Z., & Shou, M.Z. (2024) Show-o: One single transformer to unify multimodal understanding and generation. arXiv preprint arXiv:2408.12528"},{"key":"2639_CR78","unstructured":"Yang, A., Xiao, B., Wang, B., Zhang, B., Bian, C., Yin, C., Lv, C., Pan, D., Wang, D., & Yan, D., et\u00a0al. (2023) Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305"},{"key":"2639_CR79","unstructured":"Yu, J., Xu, Y., Koh, J.Y., Luong, T., Baid, G., Wang, Z., Vasudevan, V., Ku, A., Yang, Y., & Ayan, B.K., et\u00a0al. (2022) Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.107892(3), 5"},{"key":"2639_CR80","unstructured":"Yu, L., Shi, B., Pasunuru, R., Muller, B., Golovneva, O., Wang, T., Babu, A., Tang, B., Karrer, B., & Sheynin, S., et\u00a0al. (2023) Scaling autoregressive multi-modal models: Pretraining and instruction tuning. arXiv preprint arXiv:2309.025912(3)"},{"key":"2639_CR81","doi-asserted-by":"crossref","unstructured":"Zellers, R., Holtzman, A., Bisk, Y., Farhadi, A., & Choi, Y.(2019) Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830","DOI":"10.18653\/v1\/P19-1472"},{"key":"2639_CR82","unstructured":"Zheng, L., Chiang, W.L., Sheng, Y., Li, T., Zhuang, S., Wu, Z., Zhuang, Y., Li, Z., Lin, Z., & Xing, E.P., et\u00a0al. (2023) Lmsys-chat-1m: A large-scale real-world llm conversation dataset. arXiv preprint arXiv:2309.11998"},{"key":"2639_CR83","unstructured":"Zhou, C., Yu, L., Babu, A., Tirumala, K., Yasunaga, M., Shamis, L., Kahn, J., Ma, X., Zettlemoyer, L., & Levy, O. (2024) Transfusion: Predict the next token and diffuse images with one multi-modal model. arXiv preprint arXiv:2408.11039"},{"key":"2639_CR84","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023) Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02639-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02639-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02639-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T15:43:23Z","timestamp":1771602203000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02639-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":84,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["2639"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02639-5","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1]]},"assertion":[{"value":"13 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"39"}}