{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:08:35Z","timestamp":1763194115981,"version":"3.45.0"},"reference-count":28,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228908","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["DLLaVA: A Novel Multimodal Architecture with Enhanced Vision Encoder and Curriculum Learning"],"prefix":"10.1109","author":[{"given":"Yuqi","family":"Mao","sequence":"first","affiliation":[{"name":"Nankai University,College of Software,Tianjin,China"}]},{"given":"Jianyu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Nankai University,College of Software,Tianjin,China"}]}],"member":"263","reference":[{"article-title":"Gemini: a family of highly capable multimodal models","year":"2023","author":"Team","key":"ref1"},{"key":"ref2","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems"},{"article-title":"Llava-next-interleave: Tackling multi-image, video, and 3d in large multimodal models","year":"2024","author":"Li","key":"ref3"},{"key":"ref4","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"article-title":"Dinov2: Learning robust visual features without supervision","year":"2023","author":"Oquab","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2021.3069908"},{"article-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","year":"2024","author":"Wang","key":"ref7"},{"article-title":"Enhancing the reasoning ability of multimodal large language models via mixed preference optimization","year":"2024","author":"Wang","key":"ref8"},{"article-title":"Qwen2. 5 technical report","year":"2024","author":"Yang","key":"ref9"},{"article-title":"Internlm2 technical report","year":"2024","author":"Cai","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"article-title":"Convllava: Hierarchical backbones as visual encoder for large multimodal models","year":"2024","author":"Ge","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1009"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.112331"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/AINIT61980.2024.10581585"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"article-title":"The Llama 3 Herd of Models","year":"2024","author":"Grattafiori","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"article-title":"An image is worth more than 16x16 patches: Exploring transformers on individual pixels","year":"2024","author":"Nguyen","key":"ref20"},{"year":"2024","key":"ref21","article-title":"GPT-4o System Card"},{"article-title":"Claude 3.5 sonnet","volume-title":"Anthropic.ai, Tech. Rep.","year":"2024","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/wacv48630.2021.00225"},{"article-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models","year":"2023","author":"Fu","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4235-6"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228908.pdf?arnumber=11228908","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:03:48Z","timestamp":1763193828000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228908\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228908","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}