{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T22:35:09Z","timestamp":1769034909641,"version":"3.49.0"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T00:00:00Z","timestamp":1763078400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T00:00:00Z","timestamp":1763078400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62576213"],"award-info":[{"award-number":["62576213"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,11,14]]},"DOI":"10.1109\/cloudcom67567.2025.11331451","type":"proceedings-article","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T20:37:16Z","timestamp":1768941436000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["All Seeing Eyes: A Native-Resolution Vision-Language Framework for High-Fidelity Remote Sensing Image Understanding"],"prefix":"10.1109","author":[{"given":"Jingrui","family":"Zhang","sequence":"first","affiliation":[{"name":"Shenzhen MSU-BIT University,Guangdong-Hong Kong-Macao Joint Laboratory for Emotional Intelligence and Pervasive Computing,Shenzhen,China"}]},{"given":"Yong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University,Guangdong-Hong Kong-Macao Joint Laboratory for Emotional Intelligence and Pervasive Computing,Shenzhen,China"}]},{"given":"Yimeng","family":"Xu","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT 
University,Guangdong-Hong Kong-Macao Joint Laboratory for Emotional Intelligence and Pervasive Computing,Shenzhen,China"}]},{"given":"Zixuan","family":"Shangguan","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University,Guangdong-Hong Kong-Macao Joint Laboratory for Emotional Intelligence and Pervasive Computing,Shenzhen,China"}]},{"given":"Lijie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shandong Kerui Petroleum &#x0026; Gas Equipment Co., Ltd.,Dongying,China"}]},{"given":"Lihao","family":"Yang","sequence":"additional","affiliation":[{"name":"CCCC. Tianjin Dredging Co., Ltd.,Tianjin,China"}]},{"given":"Yang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Nova Stella (Shenzhen) Technology Co., Ltd.,Shenzhen,China"}]},{"given":"Xiaoyi","family":"Fan","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University,Guangdong-Hong Kong-Macao Joint Laboratory for Emotional Intelligence and Pervasive Computing,Shenzhen,China"}]},{"given":"Feng","family":"Liang","sequence":"additional","affiliation":[{"name":"Shenzhen MSU-BIT University,Guangdong-Hong Kong-Macao Joint Laboratory for Emotional Intelligence and Pervasive Computing,Shenzhen,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"ref2","first-page":"2252","article-title":"Patch n\u2019 pack: Navit, a vision transformer for any aspect ratio and resolution","volume":"36","author":"Dehghani","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.3390\/s23052385"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"ref5","article-title":"Law of vision representation in mllms","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Visual representation alignment for multimodal large language 
models","author":"Yoon","year":"2025","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"Univitar: Unified vision transformer with native resolution","author":"Qiao","year":"2025","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3390\/rs13030516"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18494\/SAM5345"},{"key":"ref10","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref11","article-title":"Meta learning to bridge vision and language models for multimodal few-shot learning","author":"Najdenkoska","year":"2023","journal-title":"arXiv preprint"},{"key":"ref12","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li","year":"2023"},{"key":"ref13","first-page":"34892","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2023","journal-title":"Advances in neural information processing systems"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CITS.2016.7546397"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3201474"},{"issue":"8","key":"ref17","article-title":"Rs5m: A large scale vision-language dataset for remote sensing vision-language foundation model","volume":"2","author":"Zhang","year":"2023","journal-title":"arXiv 
preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3390838"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2025.03.028"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2023.3250471"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01393"},{"key":"ref22","first-page":"18893","article-title":"Pix2struct: Screenshot parsing as pretraining for visual language understanding","volume-title":"International Conference on Machine Learning","author":"Lee","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"ref25","first-page":"2252","article-title":"Patch n\u2019pack: Navit, a vision transformer for any aspect ratio and resolution","volume":"36","author":"Dehghani","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref26","article-title":"Fixing the train-test resolution discrepancy","volume":"32","author":"Touvron","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref27","article-title":"Pali: A jointly-scaled multilingual language-image model","author":"Chen","year":"2022","journal-title":"arXiv preprint"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00023"},{"key":"ref29","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref30","article-title":"Qwen2. 
5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2025.01.020"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/mgrs.2020.3005751"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01953"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/1869790.1869829"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00418"},{"key":"ref38","volume-title":"Remote sensing vqa - low resolution (rsvqa lr)","author":"Lobry","year":"2022"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2685945"}],"event":{"name":"2025 IEEE International Conference on Cloud Computing Technology and Science (CloudCom)","location":"Shenzhen, China","start":{"date-parts":[[2025,11,14]]},"end":{"date-parts":[[2025,11,16]]}},"container-title":["2025 IEEE International Conference on Cloud Computing Technology and Science (CloudCom)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11330195\/11331311\/11331451.pdf?arnumber=11331451","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T07:18:44Z","timestamp":1768979924000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11331451\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,14]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/cloudcom67567.2025.11331451","relation":{},"subject":[],"published":{"date-parts":[[2025,11,14]]}}}