{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T06:09:32Z","timestamp":1773814172386,"version":"3.50.1"},"reference-count":32,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Key Technology Research and Development Program of the Zhejiang Province","award":["2022C01125"],"award-info":[{"award-number":["2022C01125"]}]},{"name":"Zhejiang Province High-Level Talent Special Support Program-Leading Talent of Technological Innovation","award":["2022R52043"],"award-info":[{"award-number":["2022R52043"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Rel."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tr.2026.3661996","type":"journal-article","created":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T21:04:47Z","timestamp":1770671087000},"page":"1281-1290","source":"Crossref","is-referenced-by-count":0,"title":["TriVLLo: Tri-View Dynamic Architecture and Unified Cross-Modal Representation for Efficient Fine-Grained Vision\u2013Language Understanding"],"prefix":"10.1109","volume":"75","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4625-4271","authenticated-orcid":false,"given":"Liang","family":"Kou","sequence":"first","affiliation":[{"name":"College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China"}]},{"given":"Wenlong","family":"Fan","sequence":"additional","affiliation":[{"name":"College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3971-8434","authenticated-orcid":false,"given":"Xingru","family":"Huang","sequence":"additional","affiliation":[{"name":"College of Communication Engineering, Hangzhou Dianzi University, Hangzhou, China"}]},{"given":"Bai","family":"Lin","sequence":"additional","affiliation":[{"name":"Systems Engineering Institute, AMS, PLA, Beijing, China"}]},{"given":"Bo","family":"Yang","sequence":"additional","affiliation":[{"name":"Systems Engineering Institute, AMS, PLA, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0241-0727","authenticated-orcid":false,"given":"Jilin","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4002-1282","authenticated-orcid":false,"given":"Yun","family":"Lin","sequence":"additional","affiliation":[{"name":"College of Information and Communication Engineering, Harbin Engineering University, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2025-721X","authenticated-orcid":false,"given":"Meiyu","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Communication Engineering, Hangzhou Dianzi University, Hangzhou, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref3","first-page":"15447","article-title":"PUMA: Empowering unified MLLM with multi-granular visual generation","volume-title":"Proc. IEEECVF Int. Conf. Comput. Vis.","author":"Fang"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TR.2020.3032744"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"ref6","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref7","article-title":"CoCa: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref8","first-page":"57730","article-title":"MM-Vet: Evaluating large multimodal models for integrated capabilities","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Yu"},{"key":"ref9","article-title":"The dawn of LMMs: Preliminary explorations with GPT-4V(ision)","author":"Yang","year":"2023"},{"key":"ref10","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref11","first-page":"22185","article-title":"Video-LaVIT: Unified video-language pre-training with decoupled visual-motional tokenization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jin"},{"key":"ref12","article-title":"SEED-X: Multimodal models with unified multi-granularity comprehension and generation","author":"Ge","year":"2024"},{"key":"ref13","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Zhu"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.207"},{"key":"ref15","article-title":"Visual attention never fades: Selective progressive attention ReCalibration for detailed image captioning in multimodal large language models","author":"Jung","year":"2025"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00725"},{"key":"ref17","first-page":"24824","article-title":"Chain-of- thought prompting elicits reasoning in large language models","volume-title":"Proc. 36th Int. Conf. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref19","first-page":"8469","article-title":"PaLM-E: An embodied multimodal language model","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Driess"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref23","article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","volume-title":"Proc. Neural Inf. Process. Syst. DatasetsBenchmarks Track","author":"Fu"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1160"},{"key":"ref26","article-title":"Qwen3 technical report","author":"Yang","year":"2025"},{"key":"ref27","article-title":"SigLIP 2: Multilingual vision-language encoders with improved semantic understanding, localization, and dense features","author":"Tschannen","year":"2025"},{"key":"ref28","article-title":"LLaMA-Adapter: Efficient fine-tuning of language models with zero-init attention","author":"Zhang","year":"2023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"ref30","article-title":"GPT-4o system card","author":"Hurst","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"ref32","article-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"}],"container-title":["IEEE Transactions on Reliability"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/24\/11317936\/11382543.pdf?arnumber=11382543","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T05:15:48Z","timestamp":1773810948000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11382543\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/tr.2026.3661996","relation":{},"ISSN":["0018-9529","1558-1721"],"issn-type":[{"value":"0018-9529","type":"print"},{"value":"1558-1721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}