{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T11:57:51Z","timestamp":1770292671997,"version":"3.49.0"},"reference-count":35,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-009"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-001"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Computer"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1109\/mc.2024.3445515","type":"journal-article","created":{"date-parts":[[2024,10,15]],"date-time":"2024-10-15T17:21:27Z","timestamp":1729012887000},"page":"124-130","source":"Crossref","is-referenced-by-count":1,"title":["Fusing AI: Multimodal Language Models Inference Across Diverse Inputs"],"prefix":"10.1109","volume":"57","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2355-9424","authenticated-orcid":false,"given":"Mla\u0111an","family":"Jovanovi\u0107","sequence":"first","affiliation":[{"name":"Singidunum University, Belgrade, Serbia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5415-6631","authenticated-orcid":false,"given":"Mark","family":"Campbell","sequence":"additional","affiliation":[{"name":"3dot Insights, Sedalia, CO, USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"crossref","DOI":"10.1093\/nsr\/nwae403","article-title":"A survey on multimodal large language models","author":"Yin","year":"2024"},{"key":"ref2","volume-title":"LANISTR: Multimodal learning from structured and unstructured data","author":"Ebrahimi","year":"2024"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580959"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"ref5","volume-title":"MM1: Methods, analysis & insights from","author":"McKinzie","year":"2024"},{"key":"ref6","volume-title":"Introduction to MM LLMs.","author":"Reganti","year":"2024"},{"key":"ref7","volume-title":"Multimodality and large multimodal models (LMMs).","author":"Huyen","year":"2024"},{"key":"ref8","volume-title":"An Introduction to Vision-Language Modeling.","author":"Bordes","year":"2024"},{"key":"ref9","volume-title":"Interviewee","author":"Jain","year":"2024"},{"key":"ref10","volume-title":"FineWeb: Decanting the web for the finest text data at scale.","author":"Penedo","year":"2024"},{"key":"ref11","volume-title":"Many-Shot In-Context Learning in Multimodal Foundation Models.","author":"Jiang","year":"2024"},{"key":"ref12","volume-title":"Vision Mamba: Efficient Visual Representation Learning With Bidirectional State Space Model.","author":"Zhu","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3386927"},{"key":"ref14","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.emnlp-main.444","volume-title":"Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations?","author":"Gekhman","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/tkde.2024.3352100"},{"key":"ref16","volume-title":"RAG-Driver: Generalisable Driving Explanations With Retrieval-Augmented In-Context Learning in Multi-Modal Large Language Models.","author":"Yuan","year":"2024"},{"key":"ref17","first-page":"1","article-title":"Efficient streaming language models with attention sinks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Xiao","year":"2024"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref19","article-title":"SegGPT: Segmenting everything in context","volume-title":"Beijing Acad. Artif. Intell.","author":"Wang","year":"2023"},{"key":"ref20","volume-title":"Lumiere: A space-time diffusion model for video generation","author":"Bar-Tal","year":"2024"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref23","volume-title":"Kosmos-2: Grounding Multimodal Large Language Models to the World.","author":"Peng","year":"2023"},{"key":"ref24","first-page":"1","article-title":"Ferret: Refer and ground anything anywhere at any granularity","volume-title":"Proc. Int. Conf. Learn. Representations","author":"You","year":"2024"},{"key":"ref25","volume-title":"TinyLLaVA: A Framework of Small-Scale Large Multimodal Models.","author":"Zhou","year":"2024"},{"key":"ref26","volume-title":"BiomedGPT: A Unified and Generalist Biomedical Generative Pre-Trained Transformer for Vision, Language, and Multimodal Tasks.","author":"Zhang","year":"2024"},{"key":"ref27","article-title":"Gemini: A family of highly capable multimodal models","year":"2024"},{"key":"ref28","volume-title":"How Culturally Aware Are Vision-Language Models?","author":"Burda-Lassen","year":"2024"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ms.2022.3233582"},{"key":"ref30","volume-title":"Scaling Monosemanticity: Extracting Interpretable Features From Claude 3 Sonnet.","year":"2024"},{"key":"ref31","volume-title":"Is in-context learning sufficient for instruction following in LLMs?","author":"Zhao","year":"2024"},{"key":"ref32","article-title":"Towards incremental learning in large language models: A critical review","author":"Jovanovic","year":"2024"},{"key":"ref33","volume-title":"LLM Augmented LLMs: Expanding Capabilities Through Composition.","author":"Bansal","year":"2024"},{"key":"ref34","volume-title":"Evaluating the Social Impact of Generative AI Systems in Systems and Society.","author":"Solaiman","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/mc.2022.3148642"}],"container-title":["Computer"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/2\/10718654\/10718668.pdf?arnumber=10718668","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T18:05:57Z","timestamp":1732903557000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10718668\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11]]},"references-count":35,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/mc.2024.3445515","relation":{},"ISSN":["0018-9162","1558-0814"],"issn-type":[{"value":"0018-9162","type":"print"},{"value":"1558-0814","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11]]}}}