{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:21:30Z","timestamp":1777890090579,"version":"3.51.4"},"reference-count":67,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01433","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"15447-15457","source":"Crossref","is-referenced-by-count":0,"title":["PUMA: Empowering Unified MLLM with Multi-Granular Visual Generation"],"prefix":"10.1109","author":[{"given":"Rongyao","family":"Fang","sequence":"first","affiliation":[{"name":"CUHK MMLab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengqi","family":"Duan","sequence":"additional","affiliation":[{"name":"HKU"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kun","family":"Wang","sequence":"additional","affiliation":[{"name":"SenseTime"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Li","sequence":"additional","affiliation":[{"name":"CUHK MMLab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Linjiang","family":"Huang","sequence":"additional","affiliation":[{"name":"BUAA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Tian","sequence":"additional","affiliation":[{"name":"SenseTime"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingyu","family":"Zeng","sequence":"additional","affiliation":[{"name":"SenseTime"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Zhao","sequence":"additional","affiliation":[{"name":"SenseTime"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jifeng","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai AI Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"CUHK MMLab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xihui","family":"Liu","sequence":"additional","affiliation":[{"name":"HKU"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref2","author":"Bai","year":"2023","journal-title":"Qwen-vl: A frontier large vision-language model with versatile abilities"},{"issue":"3","key":"ref3","first-page":"8","article-title":"Improving image generation with better captions","volume":"2","author":"Betker","year":"2023","journal-title":"Computer Science"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"ref5","volume-title":"Laion: Image data, ai, and dispossession","author":"Burger","year":"2023"},{"key":"ref6","author":"Cai","year":"2024","journal-title":"Matryoshka multimodal models"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref9","volume-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref11","author":"Dong","year":"2023","journal-title":"Dreamllm: Synergistic multimodal comprehension and creation"},{"key":"ref12","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Forty-first international conference on machine learning","author":"Esser","year":"2024"},{"key":"ref13","volume-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2024"},{"key":"ref14","author":"Gao","year":"2020","journal-title":"The pile: An 800gb dataset of diverse text for language modeling"},{"key":"ref15","author":"Ge","year":"2023","journal-title":"Making llama see and draw with seed tokenizer"},{"key":"ref16","author":"Ge","year":"2024","journal-title":"Seed-data-edit technical report: A hybrid dataset for instructional image editing"},{"key":"ref17","author":"Ge","year":"2024","journal-title":"Seed-x: Multimodal models with unified multi-granularity comprehension and generation"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2270"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref21","author":"Kingma","year":"2013","journal-title":"Auto-encoding variational bayes"},{"key":"ref22","article-title":"Generating images with multimodal language models","volume":"36","author":"Koh","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref23","volume-title":"Flux","year":"2024"},{"key":"ref24","author":"Li","year":"2024","journal-title":"Llava-onevision: Easy visual task transfer"},{"key":"ref25","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3637265"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref31","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref32","author":"Liu","year":"2024","journal-title":"World model on million-length video and language with ringattention"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3573023"},{"key":"ref34","author":"Loshchilov","year":"2017","journal-title":"Decoupled weight decay regularization"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref37","author":"Peng","year":"2023","journal-title":"Kosmos-2: Ground ing multimodal large language models to the world"},{"key":"ref38","author":"Podell","year":"2023","journal-title":"Sdxl: Improving latent diffusion mod els for high-resolution image synthesis"},{"key":"ref39","author":"Qin","year":"2023","journal-title":"Unicontrol: A unified diffusion model for controllable visual generation in the wild"},{"key":"ref40","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"issue":"2","key":"ref41","first-page":"3","volume":"1","author":"Ramesh","year":"2022","journal-title":"Hierarchical text-conditional image gener ation with clip latents"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1833"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00847"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2161"},{"key":"ref46","author":"Sun","year":"2024","journal-title":"Autoregressive model beats diffusion: Llama for scalable image generation"},{"key":"ref47","author":"Sun","year":"2023","journal-title":"Generative pretraining in multimodality"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0707"},{"key":"ref50","year":"2024","journal-title":"Chameleon: Mixed-modal early-fusion foundation models"},{"key":"ref51","author":"Tong","year":"2024","journal-title":"Cambrian- 1: A fully open, vision-centric exploration of multimodal llms"},{"key":"ref52","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref53","author":"Wang","year":"2024","journal-title":"Emu3: Next-token prediction is all you need"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01210"},{"key":"ref55","author":"Wu","year":"2023","journal-title":"Next-gpt: Any-to-any multimodal llm"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01241"},{"key":"ref57","author":"Xie","year":"2024","journal-title":"Show-o: One single transformer to unify multimodal understanding and generation"},{"key":"ref58","author":"Ye","year":"2024","journal-title":"X-vila: Cross-modality alignment for large language model"},{"key":"ref59","first-page":"13040","article-title":"mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Ye","year":"2024"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1365"},{"key":"ref61","author":"Zhang","year":"2024","journal-title":"Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual in put and output"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref63","author":"Zhang","year":"2023","journal-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention"},{"key":"ref64","author":"Zhang","year":"2023","journal-title":"Llavar: Enhanced visual instruction tuning for text-rich image understanding"},{"key":"ref65","volume-title":"Transfusion: Predict the next token and diffuse images with one multi-modal model","author":"Zhou","year":"2024"},{"key":"ref66","author":"Zhu","year":"2023","journal-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models"},{"key":"ref67","author":"Zhu","year":"2023","journal-title":"Vl-gpt: A generative pre-trained transformer for vision and language understanding and generation"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444236.pdf?arnumber=11444236","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:29:26Z","timestamp":1777613366000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444236\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":67,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01433","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}