{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:28:12Z","timestamp":1777865292245,"version":"3.51.4"},"reference-count":54,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00379","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"3977-3987","source":"Crossref","is-referenced-by-count":0,"title":["OpenVision: A Fully-Open, Cost-Effective Family of Advanced Vision Encoders for Multimodal Learning"],"prefix":"10.1109","author":[{"given":"Xianhang","family":"Li","sequence":"first","affiliation":[{"name":"University of California,Santa Cruz"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanqing","family":"Liu","sequence":"additional","affiliation":[{"name":"University of California,Santa Cruz"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoqin","family":"Tu","sequence":"additional","affiliation":[{"name":"University of California,Santa Cruz"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cihang","family":"Xie","sequence":"additional","affiliation":[{"name":"University of California,Santa Cruz"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"GPT-4V(ision) system car","author":"Achiam","year":"2023","journal-title":"OpenAI Research Blog"},{"key":"ref2","article-title":"Smollm2: When smol goes big-data-centric training of a small language model","author":"Ben Allal","year":"2025","journal-title":"arXiv preprint"},{"key":"ref3","article-title":"Contrastive localized languageimage pre-trainin","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref4","author":"Chen","year":"2024","journal-title":"Open-llava-next: An opensource implementation of llava-next series for facilitating the large multi-modal model communit"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"ref6","article-title":"Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic task","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and testtime scalin","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Nvlm: Open frontier-class multimodal 11 m","author":"Dai","year":"2024","journal-title":"arXiv preprint"},{"key":"ref9","article-title":"Molmo and pixmo: Open weights and open data for state-of-the-art multimodal model","author":"Deitke","year":"2024","journal-title":"arXiv preprint"},{"key":"ref10","article-title":"Imagenet: A large-scale hierarchical image databas","author":"Deng","year":"2009","journal-title":"CVPR"},{"key":"ref11","article-title":"Data fil-tering network","author":"Fang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00901"},{"key":"ref13","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language model","author":"Fu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"Datacomp: In search of the next generation of multimodal dataset","author":"Yitzhak Gadre","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref16","author":"Ilharco","year":"2021","journal-title":"Openclip. github"},{"key":"ref17","article-title":"Seed-bench: Benchmarking multimodal 11 ms with generative comprehensio","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"Llava-onevision: Easy visual task transfe","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref19","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillatio","volume":"34","author":"Li","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref20","article-title":"Reclip: Resource-efficient clip by training with small image","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref21","article-title":"An inverse scaling law for clip trainin","author":"Li","year":"2023","journal-title":"NeurIPS"},{"key":"ref22","article-title":"Clipa-v2: Scaling clip training with 81.1","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"What if we recaption billions of web images with llama-3","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref25","article-title":"Scaling language-image pre-training via maskin","author":"Li","year":"2023","journal-title":"CVPR"},{"key":"ref26","article-title":"Microsoft coco: Common objects in contex","author":"Lin","year":"2014","journal-title":"ECCV"},{"key":"ref27","article-title":"Improved baselines with visual instruction tunin","volume-title":"NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following","author":"Liu","year":"2023"},{"key":"ref28","article-title":"Visual instruction tunin","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref30","article-title":"Clips: An enhanced clip framework for learning with synthetic caption","author":"Liu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4235-6"},{"key":"ref32","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language task","volume":"32","author":"Lu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref33","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answerin","volume-title":"The 36th Conference on Neural Information Processing Systems (NeurIPS)","author":"Lu","year":"2022"},{"key":"ref34","article-title":"Falcon2\u201311b technical repor","author":"Malartic","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref36","year":"2024","journal-title":"Introducing Meta Llama 3: The most capable openly available LLM to date"},{"key":"ref37","article-title":"Gpt-4o system car","year":"2024","journal-title":"arXiv preprint"},{"key":"ref38","article-title":"Learning transferable visual models from natural language supervisio","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1833"},{"key":"ref40","article-title":"Eagle: Exploring the design space for multimodal 11 ms with mixture of encoder","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Shi","year":"2025"},{"key":"ref41","first-page":"8317","article-title":"Towards vqa models that can rea","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Singh","year":"2019"},{"key":"ref42","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of contex","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref43","first-page":"87310","article-title":"Cambrian-1: A fully open, vision-centric exploration of multimodal 11 m","volume":"37","author":"Tong","year":"2025","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00914"},{"key":"ref45","article-title":"How many unicorns are in this image? a safety evaluation benchmark for vision 11 m","author":"Tu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref46","article-title":"Scaling laws in patchification: An image is worth 50,176 tokens and mor","author":"Wang","year":"2025","journal-title":"ICML"},{"key":"ref47","article-title":"Demystifying clip dat","author":"Xu","year":"2023","journal-title":"ICLR"},{"key":"ref48","article-title":"Qwen2.5 technical repor","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref49","article-title":"Coca: Contrastive captioners are image-text foundation model","author":"Yu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref50","article-title":"Mm -vet: Evaluating large multimodal models for integrated capabilitie","volume-title":"ICML","author":"Yu","year":"2024"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.51"},{"key":"ref53","article-title":"Tuning layernorm in attention: Towards efficient multi-modal 11 m finetunin","author":"Zhao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref54","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language model","author":"Zhu","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444383.pdf?arnumber=11444383","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:11:50Z","timestamp":1777529510000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444383\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":54,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00379","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}