{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T21:38:38Z","timestamp":1770845918695,"version":"3.50.1"},"reference-count":120,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T00:00:00Z","timestamp":1759622400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T00:00:00Z","timestamp":1759622400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,5]]},"DOI":"10.1109\/smc58881.2025.11342688","type":"proceedings-article","created":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:54:44Z","timestamp":1769633684000},"page":"2518-2525","source":"Crossref","is-referenced-by-count":0,"title":["On the Taxonomy, Tasks, and Open-Challenges for Multimodal Large Language Models"],"prefix":"10.1109","author":[{"given":"Lecheng","family":"Yan","sequence":"first","affiliation":[{"name":"Xinjiang University,School of Computer Science and Technology,Urumqi,China,830046"}]},{"given":"Ruizhe","family":"Li","sequence":"additional","affiliation":[{"name":"University of Aberdeen,Aberdeen,United Kingdom"}]},{"given":"Jiahui","family":"Geng","sequence":"additional","affiliation":[{"name":"MBZUAI,Abu Dhabi,United Arab Emirates"}]},{"given":"Qing","family":"Li","sequence":"additional","affiliation":[{"name":"MBZUAI,Abu Dhabi,United Arab Emirates"}]},{"given":"Minghao","family":"Wu","sequence":"additional","affiliation":[{"name":"Monash University,Melbourne,Australia"}]},{"given":"Zhanyu","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Sydney,Sydney,Australia"}]},{"given":"Wenxi","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University,Beijing,China"}]},{"given":"Tianbo","family":"Ji","sequence":"additional","affiliation":[{"name":"Nantong University,Nantong,China"}]},{"given":"Shaochen","family":"Jiang","sequence":"additional","affiliation":[{"name":"Xinjiang University,School of Computer Science and Technology,Urumqi,China,830046"}]},{"given":"Chenyang","family":"Lyu","sequence":"additional","affiliation":[{"name":"Alibaba Group,Hangzhou,China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Training language models to follow instructions with human feedback","volume-title":"NeurIPS","author":"Ouyang"},{"key":"ref2","article-title":"GPT-4 Technical Report","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610443"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.ijcnlp-main.45"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1561\/0600000110"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403"},{"key":"ref7","article-title":"Learning transferable visual models from natural language supervision","volume-title":"ICML","author":"Radford"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02572-7"},{"key":"ref9","article-title":"Gpt-4o system card","author":"Hurst","year":"2024"},{"key":"ref10","article-title":"Gemini: A family of Highly Capable Multimodal Models","year":"2023"},{"key":"ref11","article-title":"Hierarchical Text-Conditional Image Generation with CliP 
latents","author":"Ramesh","year":"2022"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"ref13","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"ICML","author":"Li"},{"key":"ref14","article-title":"Visual Instruction Tuning","volume-title":"NeurIPS","author":"Liu"},{"key":"ref15","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Zhu"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref17","article-title":"VideoCrafter1: Open Diffusion Models for High-Quality Video Generation","author":"Chen","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref19","article-title":"The Claude 3 Model Family: Opus, Sonnet, Haiku","year":"2024"},{"key":"ref20","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref21","article-title":"Language Models are Unsupervised Multitask Learners","volume-title":"Technical Report","author":"Radford","year":"2019"},{"key":"ref22","article-title":"Audiobox: Unified Audio Generation with Natural Language Prompts","author":"Vyas","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref24","article-title":"Any-to-Any Generation via Composable Diffusion","author":"Tang","year":"2023"},{"key":"ref25","article-title":"Emu: Enhancing Image Generation Models Using Photogenic Needles in a Haystack","author":"Dai","year":"2023"},{"key":"ref26","article-title":"AppAgent: Multimodal Agents as Smartphone Users","author":"Yang","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.33"},{"key":"ref28","article-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day","author":"Li","year":"2023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.teachingnlp-1.14"},{"key":"ref30","article-title":"InstructBliP: Towards General-purpose Vision-Language Models with Instruction Tuning","author":"Dai","year":"2023"},{"key":"ref31","article-title":"PandaGpt: One Model To Instruction-Follow Them All","author":"Su","year":"2023"},{"key":"ref32","article-title":"Macaw-Llm: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration","author":"Lyu","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref34","article-title":"Valley: Video Assistant with Large Language model Enhanced abilitY","author":"Luo","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref37","article-title":"GPt-4V(ision) System Card","year":"2023"},{"key":"ref38","article-title":"CogVlm: Visual Expert for Pretrained Language Models","author":"Wang","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3637265"},{"key":"ref40","article-title":"Qwen-Vl: A versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","author":"Bai","year":"2023"},{"key":"ref41","article-title":"DeepSeek-Vl: Towards Real-World Vision-Language 
Understanding","author":"Lu","year":"2024"},{"key":"ref42","article-title":"Yi: Open Foundation Models by 01.Ai","author":"Young","year":"2024"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2026.3653415"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2789"},{"key":"ref45","article-title":"MiniCpm: Unveiling the Potential of Small Language Models with Scalable Training Strategies","author":"Hu","year":"2024"},{"key":"ref46","article-title":"VItrOn: A unified Pixel-level Vision Llm for Understanding, Generating, Segmenting, Editing","author":"Fei","year":"2024"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3531452"},{"key":"ref48","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024"},{"key":"ref49","article-title":"Qwen2. 5-omni technical report","author":"Xu","year":"2025"},{"key":"ref50","article-title":"Zero-Shot Text-to-Image Generation","author":"Ramesh","year":"2021"},{"key":"ref51","article-title":"Improving image generation with better captions","author":"Betker","year":"2023"},{"key":"ref52","article-title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","author":"Blattmann","year":"2023"},{"key":"ref53","article-title":"LLm-grounded Video Diffusion Models","author":"Lian","year":"2023"},{"key":"ref54","first-page":"93","article-title":"Maxfusion: Plug&play multi-modal generation in text-to-image diffusion models","volume-title":"ECCV","author":"Nair"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_12"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02346-1"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00639"},{"key":"ref58","article-title":"ANolE: An Open, Autoregressive, Native Large Multimodal Models for Interleaved Image-Text Generation","author":"Chern","year":"2024"},{"key":"ref59","article-title":"Chameleon: Mixed-modal early-fusion foundation models","year":"2024"},{"key":"ref60","article-title":"Transfusion: Predict the next token and diffuse images with one multi-modal model","author":"Zhou","year":"2024"},{"key":"ref61","article-title":"NExt-Gpt: Any-to-Any Multimodal Llm","author":"Wu","year":"2023"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.521"},{"key":"ref63","article-title":"Emu3: Next-Token Prediction is All You Need","author":"Wang","year":"2024"},{"key":"ref64","first-page":"arXiv","article-title":"Llamafusion: Adapting pretrained language models for multimodal generation","author":"Shi","year":"2024"},{"key":"ref65","article-title":"Vita-1.5: Towards gpt-4o level real-time vision and speech interaction","author":"Fu","year":"2025"},{"key":"ref66","article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"key":"ref67","article-title":"Yi: Open foundation models by 01.ai","year":"2024"},{"key":"ref68","article-title":"Efficient multimodal learning from data-centric perspective","author":"He","year":"2024"},{"key":"ref69","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","author":"A","year":"2024"},{"key":"ref70","article-title":"Neural machine translation by jointly learning to align and 
translate","volume-title":"ICLR","author":"Bahdanau"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref75","article-title":"CVqa: Culturally-diverse Multilingual Visual Question Answering Benchmark","author":"Romero","year":"2024"},{"key":"ref76","article-title":"Kosmos-G: Generating Images in Context with Multimodal Large Language Models","volume-title":"ICLR","author":"Pan"},{"key":"ref77","article-title":"Attention Is All You Need","author":"Vaswani","year":"2017","journal-title":"CoRR"},{"key":"ref78","article-title":"Can Multimodal Llms do Visual Temporal Understanding and Reasoning? The answer is No!","author":"Imam","year":"2025"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681464"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01188"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref82","article-title":"An Llm-free Multi-dimensional Benchmark for MllMs Hallucination Evaluation","author":"Wang","year":"2023"},{"key":"ref83","article-title":"Analyzing and Mitigating Object Hallucination in Large Vision-Language Models","volume-title":"ICLR","author":"Zhou"},{"key":"ref84","article-title":"HallE-switch: Rethinking and Controlling Object Existence Hallucinations in Large Vision Language Models for Detailed Caption","author":"Zhai","year":"2023"},{"key":"ref85","article-title":"MMe: A comprehensive Evaluation Benchmark for Multimodal Large Language Models","author":"Fu","year":"2023"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref87","article-title":"Evaluation and analysis of hallucination in large vision-language models","author":"Wang","year":"2023"},{"key":"ref88","article-title":"Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning","author":"Liu","year":"2023"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29771"},{"key":"ref90","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02553"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01274"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/icme59968.2025.11209377"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-2074"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ijcnn64981.2025.11228391"},{"key":"ref97","article-title":"Retrieval-Augmented Generation for Large Language Models: A survey","author":"Gao","year":"2024"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.64"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-51465-9"},{"key":"ref100","article-title":"MMicL: Empowering Vision-language Model with Multi-Modal In-Context Learning","volume-title":"ICLR","author":"Zhao"},{"key":"ref101","article-title":"Many-Shot In-Context Learning in Multimodal Foundation Models","author":"Jiang","year":"2024"},{"key":"ref102","first-page":"4174","article-title":"Cross-Lingual Transfer Robustness to Lower-Resource 
{"key":"ref103","article-title":"Trans-Tokenization and Cross-lingual Vocabulary Transfers: Language Adaptation of LLMs for Low-Resource NLP","author":"Remy","year":"2024"},{"key":"ref104","first-page":"11917","article-title":"Multimodal Cross-lingual Phrase Retrieval","volume-title":"LREC-COLING","author":"Dong","year":"2024"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/s10676-022-09624-3"},{"key":"ref106","article-title":"Survey on AI Ethics: A Socio-technical Perspective","author":"Mbiazi","year":"2023"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1007\/s00146-023-01644-x"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1250"},{"key":"ref109","article-title":"Jailbreak and guard aligned language models with only few in-context demonstrations","author":"Wei","year":"2023"},{"key":"ref110","article-title":"Jailbreaking ChatGPT via prompt engineering: An empirical study","author":"Liu","year":"2023"},{"key":"ref111","article-title":"Universal and transferable adversarial attacks on aligned language models","author":"Zou","year":"2023"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i19.30150"},{"key":"ref113","article-title":"Jailbreaking attack against multimodal large language model","author":"Niu","year":"2024"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34568"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01350"},{"key":"ref116","article-title":"CAMEL: Communicative Agents for \u201cMind\u201d Exploration of Large Scale Language Model Society","author":"Li","year":"2023"},{"key":"ref117","article-title":"AutoGen: Enabling next-gen LLM applications via multi-agent conversation framework","author":"Wu","year":"2023"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.992"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02482"},{"key":"ref120","article-title":"Eureka: Human-level reward design via coding large language models","author":"Ma","year":"2023"}],"event":{"name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","location":"Vienna, Austria","start":{"date-parts":[[2025,10,5]]},"end":{"date-parts":[[2025,10,8]]}},"container-title":["2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11342430\/11342431\/11342688.pdf?arnumber=11342688","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T20:52:04Z","timestamp":1770843124000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11342688\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,5]]},"references-count":120,"URL":"https:\/\/doi.org\/10.1109\/smc58881.2025.11342688","relation":{},"subject":[],"published":{"date-parts":[[2025,10,5]]}}}