{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T21:13:19Z","timestamp":1775596399460,"version":"3.50.1"},"reference-count":223,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222209"],"award-info":[{"award-number":["62222209"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017582","name":"Beijing National Research Center for Information Science and Technology","doi-asserted-by":"publisher","award":["BNR2023TD03006"],"award-info":[{"award-number":["BNR2023TD03006"]}],"id":[{"id":"10.13039\/501100017582","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014206","name":"Beijing Key Laboratory of Networked Multimedia","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100014206","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. 
Video Technol."],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1109\/tcsvt.2025.3635224","type":"journal-article","created":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T18:44:02Z","timestamp":1763664242000},"page":"5621-5641","source":"Crossref","is-referenced-by-count":1,"title":["Multi-Modal Generative AI: Multi-Modal LLMs, Diffusions, and the Unification"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0351-2939","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9582-7331","authenticated-orcid":false,"given":"Yuwei","family":"Zhou","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2504-3689","authenticated-orcid":false,"given":"Bin","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0943-2286","authenticated-orcid":false,"given":"Hong","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2236-9290","authenticated-orcid":false,"given":"Wenwu","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref2","volume-title":"Video Generation Models as World Simulators","author":"Brooks et al","year":"2024"},{"key":"ref3","article-title":"Chameleon: Mixed-modal early-fusion foundation models","author":"Team","year":"2024","journal-title":"arXiv:2405.09818"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.3390\/info16080688"},{"key":"ref6","article-title":"A survey of large language models","author":"Xin Zhao","year":"2023","journal-title":"arXiv:2303.18223"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3672758.3672824"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/BigData59044.2023.10386743"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.807"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3626235"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3361474"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3351601"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3427488"},{"key":"ref15","article-title":"Unified multimodal understanding and generation models: Advances, challenges, and 
opportunities","author":"Zhang","year":"2025","journal-title":"arXiv:2505.02567"},{"key":"ref16","article-title":"Towards unifying understanding and generation in the era of vision foundation models: A survey from the autoregression perspective","author":"Xie","year":"2024","journal-title":"arXiv:2410.22217"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"ref19","article-title":"LLaVA-OneVision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv:2408.03326"},{"key":"ref20","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. ICML","author":"Radford"},{"key":"ref21","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. ICLR","author":"Dosovitskiy","year":"2021"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref23","first-page":"1","article-title":"Generating diverse high-fidelity images with VQ-VAE-2","volume-title":"Proc. NeurIPS","author":"Razavi"},{"key":"ref24","article-title":"VideoGPT: Video generation using VQ-VAE and transformers","author":"Yan","year":"2021","journal-title":"arXiv:2104.10157"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref26","article-title":"Vector-quantized image modeling with improved VQGAN","volume-title":"Proc. ICLR","author":"Yu","year":"2022"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref28","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. ICML","author":"Li","year":"2023"},{"key":"ref29","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref30","first-page":"1059","article-title":"High-performance large-scale image recognition without normalization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Brock"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref33","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. ICLR","author":"Zhu","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01311"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02491-7"},{"key":"ref36","volume-title":"Fuyu-8b: A Unified Multimodal Agent for Image and Text Understanding","author":"AI","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01326"},{"key":"ref39","first-page":"71737","article-title":"OMG-LLaVA: Bridging image-level, object-level, pixel-level reasoning and understanding","volume-title":"Proc. NIPS","author":"Zhang","year":"2024"},{"key":"ref40","first-page":"1","article-title":"VisionLLM: Large language model is also an open-ended decoder for vision-centric tasks","volume-title":"Proc. 
NeurIPS","author":"Wang"},{"key":"ref41","article-title":"Vitron: A unified pixel-level vision LLM for understanding, generating, segmenting, editing","author":"Fei","year":"2024","journal-title":"arXiv:2412.19806"},{"key":"ref42","article-title":"A survey on hallucination in large vision-language models","author":"Liu","year":"2024","journal-title":"arXiv:2402.00253"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1036\/1097-8542.253500"},{"key":"ref44","first-page":"24185","article-title":"InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","volume-title":"Proc. CVPR","author":"Chen"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02553"},{"key":"ref46","first-page":"3008","article-title":"Learning to summarize with human feedback","volume-title":"Proc. NeurIPS","volume":"33","author":"Stiennon"},{"key":"ref47","article-title":"Emu3: Next-token prediction is all you need","author":"Wang","year":"2024","journal-title":"arXiv:2409.18869"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3566695"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref54","article-title":"VideoLLaMA 2: Advancing spatial\u2013temporal modeling and audio understanding in video-LLMs","author":"Cheng","year":"2024","journal-title":"arXiv:2406.07476"},{"key":"ref55","article-title":"Qwen2.5-VL technical report","volume-title":"arXiv:2502.13923","author":"Bai","year":"2025"},{"key":"ref56","article-title":"InternVL3: Exploring advanced training and test-time recipes for open-source multimodal models","author":"Zhu","year":"2025","journal-title":"arXiv:2504.10479"},{"key":"ref57","article-title":"Grounding-prompter: Prompting LLM with multimodal information for temporal sentence grounding in long videos","author":"Chen","year":"2023","journal-title":"arXiv:2312.17117"},{"key":"ref58","article-title":"LLM4VG: Large language models evaluation for video grounding","author":"Feng","year":"2023","journal-title":"arXiv:2312.14206"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"ref60","article-title":"World model on million-length video and language with blockwise RingAttention","volume-title":"Proc. ICLR","author":"Liu","year":"2025"},{"key":"ref61","article-title":"Long context transfer from language to vision","author":"Zhang","year":"2024","journal-title":"arXiv:2406.16852"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00311"},{"key":"ref64","first-page":"1","article-title":"A survey on speech large language models for understanding","volume":"2024","author":"Peng","year":"2024","journal-title":"Authorea Preprints"},{"key":"ref65","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Radford"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref69","first-page":"1","article-title":"Neural discrete representation learning","volume-title":"Proc. NeurIPS","author":"Oord"},{"key":"ref70","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2022","journal-title":"TMLR"},{"key":"ref71","article-title":"VALL-E 2: Neural codec language models are human parity zero-shot text to speech synthesizers","author":"Chen","year":"2024","journal-title":"arXiv:2406.05370"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref73","article-title":"AudioPaLM: A large language model that can speak and listen","author":"Rubenstein","year":"2023","journal-title":"arXiv:2306.12925"},{"key":"ref74","article-title":"Generative adversarial networks","volume-title":"Proc. NIPS","volume":"27","author":"Goodfellow","year":"2014"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.299"},{"key":"ref76","first-page":"1","article-title":"Generating videos with scene dynamics","volume-title":"Proc. NeurIPS","author":"Vondrick"},{"key":"ref77","first-page":"1","article-title":"Auto-encoding variational Bayes","volume-title":"Proc. ICLR","author":"Kingma"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123309"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"ref80","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. NIPS","volume":"33","author":"Ho"},{"key":"ref81","article-title":"Denoising diffusion implicit models","volume-title":"Proc. ICLR","author":"Song","year":"2021"},{"key":"ref82","first-page":"1060","article-title":"Generative adversarial text to image synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Reed"},{"key":"ref83","article-title":"Localized text-to-image generation for free via cross attention control","author":"He","year":"2023","journal-title":"arXiv:2306.14636"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"ref85","first-page":"1","article-title":"Flow matching for generative modeling","volume-title":"Proc. ICLR","author":"Lipman"},{"key":"ref86","article-title":"Text-to-image diffusion models in generative AI: A survey","author":"Zhang","year":"2023","journal-title":"arXiv:2303.07909"},{"key":"ref87","first-page":"16784","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","volume-title":"Proc. ICML","author":"Nichol","year":"2022"},{"key":"ref88","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Proc. NIPS","volume":"35","author":"Saharia"},{"key":"ref89","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022","journal-title":"arXiv:2204.06125"},{"key":"ref90","article-title":"DisenBooth: Identity-preserving disentangled tuning for subject-driven text-to-image generation","volume-title":"Proc. 
ICLR","author":"Chen","year":"2024"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2025.3557634"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680637"},{"key":"ref93","first-page":"8780","article-title":"Diffusion models beat GANs on image synthesis","volume-title":"Proc. NIPS","volume":"34","author":"Dhariwal"},{"key":"ref94","article-title":"Classifier-free diffusion guidance","volume-title":"Workshop NIPS","author":"Ho","year":"2021"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref96","article-title":"PixArt-\u03b1: Fast training of diffusion transformer for photorealistic text-to-image synthesis","volume-title":"Proc. ICLR","author":"Chen","year":"2024"},{"issue":"140","key":"ref97","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2019","journal-title":"J. Mach. Learn. Res."},{"key":"ref98","first-page":"1","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Proc. 41st ICML","author":"Esser"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref100","article-title":"Latent-shift: Latent diffusion with temporal shift for efficient text-to-video generation","author":"An","year":"2023","journal-title":"arXiv:2304.08477"},{"key":"ref101","first-page":"8633","article-title":"Video diffusion models","volume-title":"Proc. NeurIPS","volume":"35","author":"Ho"},{"key":"ref102","first-page":"1","article-title":"Make-A-video: Text-to-video generation without text-video data","volume-title":"Proc. ICLR","author":"Singer"},{"key":"ref103","first-page":"1","article-title":"AnimateDiff: Animate your personalized text-to-image diffusion models without specific tuning","volume-title":"Proc. ICLR","author":"Guo"},{"key":"ref104","article-title":"Latte: Latent diffusion transformer for video generation","author":"Ma","year":"2025","journal-title":"TMLR"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00616"},{"key":"ref106","article-title":"A survey on audio diffusion models: Text to speech synthesis and enhancement in generative AI","author":"Zhang","year":"2023","journal-title":"arXiv:2303.13336"},{"key":"ref107","article-title":"DiffWave: A versatile diffusion model for audio synthesis","volume-title":"Proc. ICLR","author":"Kong","year":"2021"},{"key":"ref108","article-title":"WaveGrad: Estimating gradients for waveform generation","volume-title":"Proc. ICLR","author":"Chen","year":"2021"},{"key":"ref109","first-page":"8599","article-title":"Grad-TTS: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Popov"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-469"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1897"},{"key":"ref112","article-title":"TTS-1 technical report","volume-title":"arXiv:2507.21138","author":"Atamanenko","year":"2025"},{"key":"ref113","article-title":"MiniMax-speech: Intrinsic zero-shot text-to-speech with a learnable speaker encoder","author":"Zhang","year":"2025","journal-title":"arXiv:2505.07916"},{"key":"ref114","article-title":"VL-GPT: A generative pre-trained transformer for vision and language understanding and generation","author":"Zhu","year":"2023","journal-title":"arXiv:2312.09251"},{"key":"ref115","article-title":"Autoregressive model beats diffusion: Llama for scalable image generation","author":"Sun","year":"2024","journal-title":"arXiv:2406.06525"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.521"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01210"},{"key":"ref119","first-page":"84839","article-title":"Visual autoregressive modeling: Scalable image generation via next-scale prediction","volume-title":"Proc. NIPS","volume":"37","author":"Tian","year":"2024"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01464"},{"key":"ref122","article-title":"Janus-pro: Unified multimodal understanding and generation with data and model scaling","author":"Chen","year":"2025","journal-title":"arXiv:2501.17811"},{"key":"ref123","article-title":"Mogao: An omni foundation model for interleaved multi-modal generation","author":"Liao","year":"2025","journal-title":"arXiv:2505.05472"},{"key":"ref124","article-title":"Emerging properties in unified multimodal pretraining","author":"Deng","year":"2025","journal-title":"arXiv:2505.14683"},{"key":"ref125","article-title":"Visual ChatGPT: Talking, drawing and editing with visual foundation models","author":"Wu","year":"2023","journal-title":"arXiv:2303.04671"},{"key":"ref126","first-page":"1","article-title":"HuggingGPT: Solving AI tasks with ChatGPT and its friends in hugging face","volume-title":"Proc. NeurIPS","author":"Shen"},{"key":"ref127","article-title":"Tool-LMM: A large multi-modal model for tool agent learning","author":"Wang","year":"2024","journal-title":"arXiv:2401.10727"},{"key":"ref128","article-title":"Kosmos-G: Generating images in context with multimodal large language models","volume-title":"Proc. ICLR","author":"Pan","year":"2024"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02589"},{"key":"ref130","article-title":"SEED-X: Multimodal models with unified multi-granularity comprehension and generation","author":"Ge","year":"2024","journal-title":"arXiv:2404.14396"},{"key":"ref131","article-title":"BLIP3-o: A family of fully open unified multimodal models-architecture, training and dataset","author":"Chen","year":"2025","journal-title":"arXiv:2505.09568"},{"key":"ref132","article-title":"Qwen-image technical report","volume-title":"arXiv:2508.02324","author":"Wu","year":"2025"},{"key":"ref133","article-title":"Transfusion: Predict the next token and diffuse images with one multi-modal model","volume-title":"Proc. 
ICLR","author":"Zhou","year":"2025"},{"key":"ref134","article-title":"Show-o: One single transformer to unify multimodal understanding and generation","volume-title":"Proc. ICLR","author":"Xie","year":"2025"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW67362.2025.00345"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"ref139","article-title":"LMFusion: Adapting pretrained language models for multimodal generation","author":"Shi","year":"2024","journal-title":"arXiv:2412.15188"},{"key":"ref140","article-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023","journal-title":"arXiv:2308.12966"},{"key":"ref141","volume-title":"The Claude 3 Model Family: Opus, Sonnet, Haiku","year":"2024"},{"key":"ref142","article-title":"MiniCPM-V: A GPT-4V level MLLM on your phone","author":"Yao","year":"2024","journal-title":"arXiv:2408.01800"},{"key":"ref143","article-title":"VILA: On pre-training for visual language models","author":"Lin","year":"2023","journal-title":"arXiv:2312.07533"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0795"},{"key":"ref145","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"Proc. ICLR","author":"Tang","year":"2024"},{"key":"ref146","article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023","journal-title":"arXiv:2311.07919"},{"key":"ref147","article-title":"OSUM: Advancing open speech understanding models with limited resources in academia","author":"Geng","year":"2025","journal-title":"arXiv:2501.13306"},{"key":"ref148","first-page":"1","article-title":"CogVideo: Large-scale pretraining for text-to-video generation via transformers","volume-title":"Proc. 
ICLR","author":"Hong"},{"key":"ref149","article-title":"Wan: Open and advanced large-scale video generative models","author":"Wan","year":"2025","journal-title":"arXiv:2503.20314"},{"key":"ref150","article-title":"HunyuanVideo: A systematic framework for large video generative models","author":"Kong","year":"2024","journal-title":"arXiv:2412.03603"},{"key":"ref151","article-title":"Vidu: A highly consistent, dynamic and skilled text-to-video generator with diffusion models","author":"Bao","year":"2024","journal-title":"arXiv:2405.04233"},{"key":"ref152","article-title":"Skywork UniPic: Unified autoregressive modeling for visual understanding and generation","author":"Wang","year":"2025","journal-title":"arXiv:2508.03320"},{"key":"ref153","article-title":"Transfer between modalities with MetaQueries","author":"Pan","year":"2025","journal-title":"arXiv:2504.06256"},{"key":"ref154","article-title":"OmniGen2: Exploration to advanced multimodal generation","author":"Wu","year":"2025","journal-title":"arXiv:2506.18871"},{"key":"ref155","article-title":"Qwen2.5-omni technical report","volume-title":"arXiv:2503.20215","author":"Xu","year":"2025"},{"key":"ref156","article-title":"Qwen3-omni technical report","volume-title":"arXiv:2509.17765","author":"Xu","year":"2025"},{"key":"ref157","article-title":"Ming-omni: A unified multimodal model for perception and generation","author":"Gong","year":"2025","journal-title":"arXiv:2506.09344"},{"key":"ref158","article-title":"Show-o2: Improved native unified multimodal models","author":"Xie","year":"2025","journal-title":"arXiv:2506.15564"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2015.66"},{"key":"ref160","first-page":"1143","article-title":"Im2Text: Describing images using 1 million captioned photographs","volume-title":"Proc. NeurIPS","volume":"24","author":"Ord\u00f3\u00f1ez"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref163","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. NeurIPS","author":"Schuhmann"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1160"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref166","first-page":"1","article-title":"InternVid: A large-scale video-text dataset for multimodal understanding and generation","volume-title":"Proc. 12th ICLR","author":"Wang"},{"key":"ref167","first-page":"1","article-title":"VideoFactory: Swap attention in spatiotemporal diffusions for text-to-video generation","volume-title":"Proc. 
ICLR","author":"Wang"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110818"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17635"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"ref181","first-page":"1","article-title":"CLEVRER: CoLlision events for video REpresentation and reasoning","volume-title":"Proc. ICLR","author":"Yi"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3510735"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3514820"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3349567"},{"key":"ref186","article-title":"NExT-GPT: Any-to-any multimodal LLM","volume-title":"Proc. ICML","author":"Wu","year":"2024"},{"key":"ref187","article-title":"X-VILA: Cross-modality alignment for large language model","author":"Ye","year":"2024","journal-title":"arXiv:2405.19335"},{"key":"ref188","first-page":"1","article-title":"VideoPoet: A large language model for zero-shot video generation","volume-title":"Proc. ICML","author":"Kondratyuk"},{"key":"ref189","article-title":"Language model beats diffusion\u2014Tokenizer is key to visual generation","volume-title":"Proc. ICLR","author":"Yu","year":"2024"},{"key":"ref190","article-title":"Video-LaVIT: Unified video-language pre-training with decoupled visual-motional tokenization","volume-title":"Proc. 
ICML","author":"Jin","year":"2024"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2270"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1093\/bib\/bbab340"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1016\/j.fmre.2024.11.027"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127052"},{"key":"ref196","article-title":"Exploring the potential of large language models in graph generation","author":"Yao","year":"2024","journal-title":"arXiv:2403.14358"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1016\/j.compbiomed.2024.108073"},{"key":"ref198","article-title":"Multimodal graph benchmark","author":"Zhu","year":"2024","journal-title":"arXiv:2406.16321"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/mis.2026.3650817"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00624-6"},{"key":"ref201","article-title":"Multimodal graph learning for generative tasks","volume-title":"Workshop NIPS","author":"Yoon","year":"2023"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"ref203","first-page":"32211","article-title":"Consistency models","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Song"},{"key":"ref204","article-title":"Latent consistency models: Synthesizing high-resolution images with few-step inference","author":"Luo","year":"2023","journal-title":"arXiv:2310.04378"},{"key":"ref205","first-page":"1","article-title":"Flow straight and fast: Learning to generate and transfer data with rectified flow","volume-title":"Proc. ICLR","author":"Liu"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00147"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00196"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72992-8_23"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01608"},{"key":"ref211","article-title":"LAPTOP-diff: Layer pruning and normalized distillation for compressing diffusion models","author":"Zhang","year":"2024","journal-title":"arXiv:2404.11098"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01492"},{"key":"ref213","article-title":"\u0394-DiT: A training-free acceleration method tailored for diffusion transformers","author":"Chen","year":"2024","journal-title":"arXiv:2406.01125"},{"key":"ref214","article-title":"Lightweight diffusion models with distillation-based block neural architecture search","author":"Tang","year":"2023","journal-title":"arXiv:2311.04950"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00654"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-025-00099-6"},{"key":"ref217","article-title":"Mini-gemini: Mining the potential of multi-modality vision language models","author":"Li","year":"2024","journal-title":"arXiv:2403.18814"},{"key":"ref218","article-title":"MoE-LLaVA: Mixture of experts for large vision-language models","author":"Lin","year":"2024","journal-title":"arXiv:2401.15947"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i10.33131"},{"key":"ref220","first-page":"38087","article-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Xiao"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1016\/j.birob.2023.100131"},{"key":"ref222","first-page":"1","article-title":"EmbodiedGPT: Vision-language pre-training via embodied chain of thought","volume-title":"Proc. NeurIPS","author":"Mu"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.06.001"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/76\/11475579\/11261902.pdf?arnumber=11261902","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T20:04:15Z","timestamp":1775592255000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11261902\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":223,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2025.3635224","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4]]}}}