{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T06:20:29Z","timestamp":1774419629875,"version":"3.50.1"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10889214","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T17:15:19Z","timestamp":1741799719000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["MixSense : Mixture of Vision Sense"],"prefix":"10.1109","author":[{"given":"Jian","family":"Lin","sequence":"first","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]},{"given":"Zhuoran","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computing and Information, University of Pittsburgh,Pittsburgh,The United States"}]},{"given":"Qibo","family":"Qiu","sequence":"additional","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]},{"given":"Jianzhong","family":"Chen","sequence":"additional","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]},{"given":"Zixian","family":"Ge","sequence":"additional","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]},{"given":"Weizhong","family":"Jin","sequence":"additional","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]},{"given":"Yuchao","family":"Yan","sequence":"additional","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]},{"given":"Li","family":"Yu","sequence":"additional","affiliation":[{"name":"Research &amp; Innovation Institute, China Mobile (Zhejiang),Hangzhou,China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"ref2","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"NeurIPS"},{"issue":"1","key":"ref3","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"ref5","article-title":"Opt: Open pre-trained transformer language models","author":"Zhang","year":"2022"},{"key":"ref6","article-title":"Bloom: A 176b-parameter open-access multilingual language model","author":"Workshop","year":"2022"},{"issue":"240","key":"ref7","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref8","article-title":"Llama: Open and efficient foundation language 
models","author":"Touvron","year":"2023"},{"key":"ref9","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref10","article-title":"Shikra: Unleashing multimodal llm\u2019s referential dialogue magic","author":"Chen","year":"2023"},{"key":"ref11","article-title":"Llavar: Enhanced visual instruction tuning for text-rich image understanding","author":"Zhang","year":"2023"},{"key":"ref12","article-title":"Ferret: Refer and ground anything anywhere at any granularity","author":"You","year":"2023"},{"key":"ref13","article-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023"},{"key":"ref14","article-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning","author":"Chen","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref17","article-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models","author":"Lin","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02484"},{"key":"ref19","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","volume-title":"website","author":"Chiang","year":"2023"},{"key":"ref20","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref21","first-page":"26183","article-title":"You only look at one sequence: Rethinking transformer in vision through object detection","volume":"34","author":"Fang","year":"2021","journal-title":"NeurIPS"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72946-1_23"},{"key":"ref25","article-title":"Griffon v2: Advancing multimodal perception with high-resolution scaling and visual-language co-referring","author":"Zhan","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"ref29","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","author":"Lu","year":"2022","journal-title":"NeurIPS"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref32","article-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2024","journal-title":"ICML"},{"key":"ref33","article-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension","author":"Li","year":"2023"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing 
(ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10889214.pdf?arnumber=10889214","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:23:50Z","timestamp":1774416230000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10889214\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10889214","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}