{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:05:57Z","timestamp":1775815557669,"version":"3.50.1"},"reference-count":132,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s11432-024-4231-5","type":"journal-article","created":{"date-parts":[[2024,12,19]],"date-time":"2024-12-19T02:51:48Z","timestamp":1734576708000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":244,"title":["How far are we to GPT-4V? Closing the gap to commercial multimodal models with open-source suites"],"prefix":"10.1007","volume":"67","author":[{"given":"Zhe","family":"Chen","sequence":"first","affiliation":[]},{"given":"Weiyun","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Shenglong","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Zhangwei","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Erfei","family":"Cui","sequence":"additional","affiliation":[]},{"given":"Wenwen","family":"Tong","sequence":"additional","affiliation":[]},{"given":"Kongzhi","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Jiapeng","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Zheng","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Ji","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Jiaqi","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiaoyi","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Hewei","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Conghui","family":"He","sequence":"additional","affiliation":[]},{"given":"Botian","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Zhenjiang","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Chao","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Bin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xingjian","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wenjian","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Pinlong","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Licheng","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Xiangchao","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Min","family":"Dou","sequence":"additional","affiliation":[]},{"given":"Lewei","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Xizhou","family":"Zhu","sequence":"add
itional","affiliation":[]},{"given":"Tong","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Jifeng","family":"Dai","sequence":"additional","affiliation":[]},{"given":"Wenhai","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,13]]},"reference":[{"key":"4231_CR1","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"W H Wang","year":"2024","unstructured":"Wang W H, Chen Z, Chen X K, et al. VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. In: Proceedings of Advances in Neural Information Processing Systems, 2024"},{"key":"4231_CR2","volume-title":"Qwen-VL: a Frontier large vision-language model with versatile abilities","author":"J Z Bai","year":"2023","unstructured":"Bai J Z, Bai S, Yang S S, et al. Qwen-VL: a Frontier large vision-language model with versatile abilities. 2023. ArXiv:2308.12966"},{"key":"4231_CR3","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"H T Liu","year":"2024","unstructured":"Liu H T, Li C Y, Wu Q Y, et al. Visual instruction tuning. In: Proceedings of Advances in Neural Information Processing Systems, 2024"},{"key":"4231_CR4","first-page":"26296","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"H T Liu","year":"2024","unstructured":"Liu H T, Li C Y, Li Y H, et al. Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 26296\u201326306"},{"key":"4231_CR5","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"D Y Zhu","year":"2024","unstructured":"Zhu D Y, Chen J, Shen X Q, et al. MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: Proceedings of the 12th International Conference on Learning Representations, 2024"},{"key":"4231_CR6","first-page":"24185","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Chen","year":"2024","unstructured":"Chen Z, Wu J N, Wang W H, et al. InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 24185\u201324198"},{"key":"4231_CR7","volume-title":"MM1: methods, analysis & insights from multimodal LLM pre-training","author":"B McKinzie","year":"2024","unstructured":"McKinzie B, Gan Z, Fauconnier J P, et al. MM1: methods, analysis & insights from multimodal LLM pre-training. 2024. ArXiv:2403.09611"},{"key":"4231_CR8","volume-title":"InternLM-XComposer2: mastering free-form text-image composition and comprehension in vision-language large model","author":"X Y Dong","year":"2024","unstructured":"Dong X Y, Zhang P, Zang Y H, et al. InternLM-XComposer2: mastering free-form text-image composition and comprehension in vision-language large model. 2024. ArXiv:2401.16420"},{"key":"4231_CR9","volume-title":"Gemini 1.5: unlocking multimodal understanding across millions of tokens of context","author":"M Reid","year":"2024","unstructured":"Reid M, Savinov N, Teplyashin D, et al. Gemini 1.5: unlocking multimodal understanding across millions of tokens of context. 2024. 
ArXiv:2403.05530"},{"key":"4231_CR10","volume-title":"Gemini: a family of highly capable multimodal models","author":"G Team","year":"2023","unstructured":"Team G, Anil R, Borgeaud S, et al. Gemini: a family of highly capable multimodal models. 2023. ArXiv:2312.11805"},{"key":"4231_CR11","volume-title":"CogVLM: visual expert for pretrained language models","author":"W H Wang","year":"2023","unstructured":"Wang W H, Lv Q S, Yu W M, et al. CogVLM: visual expert for pretrained language models. 2023. ArXiv:2311.03079"},{"key":"4231_CR12","volume-title":"DeepSeek-VL: towards real-world vision-language understanding","author":"H Y Lu","year":"2024","unstructured":"Lu H Y, Liu W, Zhang B, et al. DeepSeek-VL: towards real-world vision-language understanding. 2024. ArXiv:2403.05525"},{"key":"4231_CR13","volume-title":"LLaVA-NeXT: improved reasoning, OCR, and world knowledge","author":"H T Liu","year":"2024","unstructured":"Liu H T, Li C Y, Li Y H, et al. LLaVA-NeXT: improved reasoning, OCR, and world knowledge. 2024. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"4231_CR14","volume-title":"InternLM2 technical report","author":"Z Cai","year":"2024","unstructured":"Cai Z, Cao M, Chen H J, et al. InternLM2 technical report. 2024. ArXiv:2403.17297"},{"key":"4231_CR15","first-page":"8317","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Singh","year":"2019","unstructured":"Singh A, Natarajan V, Shah M, et al. Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019. 8317\u20138326"},{"key":"4231_CR16","first-page":"2263","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"A Masry","year":"2022","unstructured":"Masry A, Do X L, Tan J Q, et al. ChartQA: a benchmark for question answering about charts with visual and logical reasoning. In: Proceedings of Findings of the Association for Computational Linguistics, 2022. 2263\u20132279"},{"key":"4231_CR17","first-page":"2200","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"M Mathew","year":"2021","unstructured":"Mathew M, Karatzas D, Jawahar C V, et al. DocVQA: a dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2021. 2200\u20132209"},{"key":"4231_CR18","volume-title":"LLaMA: open and efficient foundation language models","author":"H Touvron","year":"2023","unstructured":"Touvron H, Lavril T, Izacard G, et al. LLaMA: open and efficient foundation language models. 2023. ArXiv:2302.13971"},{"key":"4231_CR19","first-page":"24824","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"J Wei","year":"2022","unstructured":"Wei J, Wang X Z, Schuurmans D, et al. Chain-of-thought prompting elicits reasoning in large language models. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 24824\u201324837"},{"key":"4231_CR20","volume-title":"LLaMA 2: open foundation and fine-tuned chat models","author":"H Touvron","year":"2023","unstructured":"Touvron H, Martin L, Stone K, et al. LLaMA 2: open foundation and fine-tuned chat models. 2023. ArXiv:2307.09288"},{"key":"4231_CR21","first-page":"46595","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"L M Zheng","year":"2023","unstructured":"Zheng L M, Chiang W L, Sheng Y, et al. 
Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. In: Proceedings of Advances in Neural Information Processing Systems, 2023. 46595\u201346623"},{"key":"4231_CR22","volume-title":"MOSS: training conversational language models from synthetic data","author":"T X Sun","year":"2023","unstructured":"Sun T X, Zhang X T, He Z F, et al. MOSS: training conversational language models from synthetic data. 2023. ArXiv:2307.15020"},{"key":"4231_CR23","first-page":"320","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics","author":"Z X Du","year":"2022","unstructured":"Du Z X, Qian Y J, Liu X, et al. GLM: general language model pretraining with autoregressive blank infilling. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, 2022. 320\u2013335"},{"key":"4231_CR24","volume-title":"Qwen technical report","author":"J Z Bai","year":"2023","unstructured":"Bai J Z, Bai S, Chu Y F, et al. Qwen technical report. 2023. ArXiv:2309.16609"},{"key":"4231_CR25","volume-title":"Baichuan 2: open large-scale language models","author":"A Y Yang","year":"2023","unstructured":"Yang A Y, Xiao B, Wang B N, et al. Baichuan 2: open large-scale language models. 2023. ArXiv:2309.10305"},{"key":"4231_CR26","volume-title":"Skywork: a more open bilingual foundation model","author":"T W Wei","year":"2023","unstructured":"Wei T W, Zhao L, Zhang L C, et al. Skywork: a more open bilingual foundation model. 2023. ArXiv:2310.19341"},{"key":"4231_CR27","volume-title":"DeepSeek LLM: scaling open-source language models with longtermism","author":"X Bi","year":"2024","unstructured":"Bi X, Chen D, Chen G T, et al. DeepSeek LLM: scaling open-source language models with longtermism. 2024. ArXiv:2401.02954"},{"key":"4231_CR28","volume-title":"LLaMA-adapter: efficient fine-tuning of language models with zero-init attention","author":"R R Zhang","year":"2023","unstructured":"Zhang R R, Han J M, Liu C, et al. LLaMA-adapter: efficient fine-tuning of language models with zero-init attention. 2023. ArXiv:2303.16199"},{"key":"4231_CR29","volume-title":"GPT4RoI: instruction tuning large language model on region-of-interest","author":"S L Zhang","year":"2023","unstructured":"Zhang S L, Sun P Z, Chen S F, et al. GPT4RoI: instruction tuning large language model on region-of-interest. 2023. ArXiv:2307.03601"},{"key":"4231_CR30","volume-title":"Next-GPT: any-to-any multimodal LLM","author":"S Q Wu","year":"2023","unstructured":"Wu S Q, Fei H, Qu L G, et al. Next-GPT: any-to-any multimodal LLM. 2023. ArXiv:2309.05519"},{"key":"4231_CR31","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"Q Sun","year":"2023","unstructured":"Sun Q, Yu Q Y, Cui Y F, et al. Emu: generative pretraining in multimodality. In: Proceedings of the 12th International Conference on Learning Representations, 2023"},{"key":"4231_CR32","first-page":"23716","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"J B Alayrac","year":"2022","unstructured":"Alayrac J B, Donahue J, Luc P, et al. Flamingo: a visual language model for few-shot learning. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 23716\u201323736"},{"key":"4231_CR33","volume-title":"LISA: reasoning segmentation via large language model","author":"X Lai","year":"2023","unstructured":"Lai X, Tian Z T, Chen Y K, et al. LISA: reasoning segmentation via large language model. 2023. 
ArXiv:2308.00692"},{"key":"4231_CR34","volume-title":"Otter: a multi-modal model with in-context instruction tuning","author":"B Li","year":"2023","unstructured":"Li B, Zhang Y H, Chen L Y, et al. Otter: a multi-modal model with in-context instruction tuning. 2023. ArXiv:2305.03726"},{"key":"4231_CR35","first-page":"26763","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Li","year":"2024","unstructured":"Li Z, Yang B, Liu Q, et al. Monkey: image resolution and text label are important things for large multi-modal models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 26763\u201326773"},{"key":"4231_CR36","volume-title":"VideoChat: chat-centric video understanding","author":"K C Li","year":"2023","unstructured":"Li K C, He Y N, Wang Y, et al. VideoChat: chat-centric video understanding. 2023. ArXiv:2305.06355"},{"key":"4231_CR37","volume-title":"MoE-LLaVA: mixture of experts for large vision-language models","author":"B Lin","year":"2024","unstructured":"Lin B, Tang Z Y, Ye Y, et al. MoE-LLaVA: mixture of experts for large vision-language models. 2024. ArXiv:2401.15947"},{"key":"4231_CR38","volume-title":"InternGPT: solving vision-centric tasks by interacting with ChatGPT beyond language","author":"Z Y Liu","year":"2023","unstructured":"Liu Z Y, He Y N, Wang W H, et al. InternGPT: solving vision-centric tasks by interacting with ChatGPT beyond language. 2023. ArXiv:2305.05662"},{"key":"4231_CR39","volume-title":"ControlLLM: augment language models with tools by searching on graphs","author":"Z Y Liu","year":"2023","unstructured":"Liu Z Y, Lai Z Q, Gao Z W, et al. ControlLLM: augment language models with tools by searching on graphs. 2023. ArXiv:2310.17796"},{"key":"4231_CR40","volume-title":"MM-interleaved: interleaved image-text generative modeling via multi-modal feature synchronizer","author":"C Y Tian","year":"2024","unstructured":"Tian C Y, Zhu X Z, Xiong Y W, et al. MM-interleaved: interleaved image-text generative modeling via multi-modal feature synchronizer. 2024. ArXiv:2401.10208"},{"key":"4231_CR41","volume-title":"The all-seeing project V2: towards general relation comprehension of the open world","author":"W Y Wang","year":"2024","unstructured":"Wang W Y, Ren Y M, Luo H W, et al. The all-seeing project V2: towards general relation comprehension of the open world. 2024. ArXiv:2402.19474"},{"key":"4231_CR42","volume-title":"InternVideo2: scaling video foundation models for multimodal video understanding","author":"Y Wang","year":"2024","unstructured":"Wang Y, Li K C, Li X H, et al. InternVideo2: scaling video foundation models for multimodal video understanding. 2024. ArXiv:2403.15377"},{"key":"4231_CR43","volume-title":"VideoLLM: modeling video sequence with large language models","author":"G Chen","year":"2023","unstructured":"Chen G, Zheng Y D, Wang J H, et al. VideoLLM: modeling video sequence with large language models. 2023. ArXiv:2305.13292"},{"key":"4231_CR44","volume-title":"Shikra: unleashing multimodal LLM\u2019s referential dialogue magic","author":"K Q Chen","year":"2023","unstructured":"Chen K Q, Zhang Z, Zeng W L, et al. Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. 2023. ArXiv:2306.15195"},{"key":"4231_CR45","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"W Y Wang","year":"2024","unstructured":"Wang W Y, Shi M, Li Q Y, et al. 
The all-seeing project: towards panoptic visual recognition and understanding of the open world. In: Proceedings of the 12th International Conference on Learning Representations, 2024"},{"key":"4231_CR46","volume-title":"Kosmos-2: grounding multimodal large language models to the world","author":"Z L Peng","year":"2023","unstructured":"Peng Z L, Wang W H, Dong L, et al. Kosmos-2: grounding multimodal large language models to the world. 2023. ArXiv:2306.14824"},{"key":"4231_CR47","volume-title":"CogAgent: a visual language model for GUI agents","author":"W Y Hong","year":"2023","unstructured":"Hong W Y, Wang W H, Lv Q S, et al. CogAgent: a visual language model for GUI agents. 2023. ArXiv:2312.08914"},{"key":"4231_CR48","volume-title":"KOSMOS-2.5: a multimodal literate model","author":"T C Lv","year":"2023","unstructured":"Lv T C, Huang Y P, Chen J Y, et al. KOSMOS-2.5: a multimodal literate model. 2023. ArXiv:2309.11419"},{"key":"4231_CR49","volume-title":"Vary: scaling up the vision vocabulary for large vision-language models","author":"H R Wei","year":"2023","unstructured":"Wei H R, Kong L Y, Chen J Y, et al. Vary: scaling up the vision vocabulary for large vision-language models. 2023. ArXiv:2312.06109"},{"key":"4231_CR50","volume-title":"Feast your eyes: mixture-of-resolution adaptation for multimodal large language models","author":"G Luo","year":"2024","unstructured":"Luo G, Zhou Y Y, Zhang Y X, et al. Feast your eyes: mixture-of-resolution adaptation for multimodal large language models. 2024. ArXiv:2403.03003"},{"key":"4231_CR51","volume-title":"Mini-Gemini: mining the potential of multi-modality vision language models","author":"Y W Li","year":"2024","unstructured":"Li Y W, Zhang Y C, Wang C Y, et al. Mini-Gemini: mining the potential of multi-modality vision language models. 2024. ArXiv:2403.18814"},{"key":"4231_CR52","volume-title":"mPLUG-DocOwl 1.5: unified structure learning for OCR-free document understanding","author":"A W Hu","year":"2024","unstructured":"Hu A W, Xu H Y, Ye J B, et al. mPLUG-DocOwl 1.5: unified structure learning for OCR-free document understanding. 2024. ArXiv:2403.12895"},{"key":"4231_CR53","volume-title":"OtterHD: a high-resolution multi-modality model","author":"B Li","year":"2023","unstructured":"Li B, Zhang P Y, Yang J K, et al. OtterHD: a high-resolution multi-modality model. 2023. ArXiv:2311.04219"},{"key":"4231_CR54","volume-title":"SPHINX: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models","author":"Z Y Lin","year":"2023","unstructured":"Lin Z Y, Liu C, Zhang R R, et al. SPHINX: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. 2023. ArXiv:2311.07575"},{"key":"4231_CR55","volume-title":"TextMonkey: an OCR-free large multimodal model for understanding document","author":"Y L Liu","year":"2024","unstructured":"Liu Y L, Yang B, Liu Q, et al. TextMonkey: an OCR-free large multimodal model for understanding document. 2024. ArXiv:2403.04473"},{"key":"4231_CR56","volume-title":"LLaVA-UHD: an LMM perceiving any aspect ratio and high-resolution images","author":"R Y Xu","year":"2024","unstructured":"Xu R Y, Yao Y, Guo Z H, et al. LLaVA-UHD: an LMM perceiving any aspect ratio and high-resolution images. 2024. ArXiv:2403.11703"},{"key":"4231_CR57","volume-title":"UReader: universal OCR-free visually-situated language understanding with multimodal large language model","author":"J B Ye","year":"2023","unstructured":"Ye J B, Hu A W, Xu H Y, et al. 
UReader: universal OCR-free visually-situated language understanding with multimodal large language model. 2023. ArXiv:2310.05126"},{"key":"4231_CR58","volume-title":"InternLM-XComposer2-4KHD: a pioneering large vision-language model handling resolutions from 336 pixels to 4K HD","author":"X Y Dong","year":"2024","unstructured":"Dong X Y, Zhang P, Zang Y H, et al. InternLM-XComposer2-4KHD: a pioneering large vision-language model handling resolutions from 336 pixels to 4K HD. 2024. ArXiv:2404.06512"},{"key":"4231_CR59","first-page":"8748","volume-title":"Proceedings of the International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"Radford A, Kim J W, Hallacy C, et al. Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning, 2021. 8748\u20138763"},{"key":"4231_CR60","doi-asserted-by":"publisher","first-page":"11975","DOI":"10.1007\/978-3-030-96530-3","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"X H Zhai","year":"2023","unstructured":"Zhai X H, Mustafa B, Kolesnikov A, et al. Sigmoid loss for language image pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023. 11975\u201311986"},{"key":"4231_CR61","first-page":"9568","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"S B Tong","year":"2024","unstructured":"Tong S B, Liu Z, Zhai Y X, et al. Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 9568\u20139578"},{"key":"4231_CR62","volume-title":"DINOv2: learning robust visual features without supervision","author":"M Oquab","year":"2024","unstructured":"Oquab M, Darcet T, Moutakanni T, et al. DINOv2: learning robust visual features without supervision. 2024. ArXiv:2304.07193"},{"key":"4231_CR63","volume-title":"Groma: localized visual tokenization for grounding multimodal large language models","author":"C F Ma","year":"2024","unstructured":"Ma C F, Jiang Y, Wu J N, et al. Groma: localized visual tokenization for grounding multimodal large language models. 2024. ArXiv:2404.13013"},{"key":"4231_CR64","volume-title":"Yi: open foundation models by 01.AI","author":"A Young","year":"2024","unstructured":"Young A, Chen B, Li C, et al. Yi: open foundation models by 01.AI. 2024. ArXiv:2403.04652"},{"key":"4231_CR65","first-page":"25278","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann C, Beaumont R, Vencu R, et al. LAION-5B: an open large-scale dataset for training next generation image-text models. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 25278\u201325294"},{"key":"4231_CR66","volume-title":"COYO-700M: image-text pair dataset","author":"M Byeon","year":"2022","unstructured":"Byeon M, Park B, Kim H, et al. COYO-700M: image-text pair dataset. 2022. https:\/\/github.com\/kakaobrain\/coyo-dataset"},{"key":"4231_CR67","volume-title":"Microsoft COCO captions: data collection and evaluation server","author":"X Chen","year":"2015","unstructured":"Chen X, Fang H, Lin T Y, et al. Microsoft COCO captions: data collection and evaluation server. 2015. 
ArXiv:1504.00325"},{"key":"4231_CR68","first-page":"26418","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"J X Gu","year":"2022","unstructured":"Gu J X, Meng X J, Lu G S, et al. Wukong: a 100 million large-scale Chinese cross-modal pre-training benchmark. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 26418\u201326431"},{"key":"4231_CR69","first-page":"742","volume-title":"Proceedings of the 16th European Conference on Computer Vision","author":"O Sidorov","year":"2020","unstructured":"Sidorov O, Hu R H, Rohrbach M, et al. TextCaps: a dataset for image captioning with reading comprehension. In: Proceedings of the 16th European Conference on Computer Vision, 2020. 742\u2013758"},{"key":"4231_CR70","first-page":"8430","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"S Shao","year":"2019","unstructured":"Shao S, Li Z M, Zhang T Y, et al. Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019. 8430\u20138439"},{"key":"4231_CR71","first-page":"1287","volume-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"F X Liu","year":"2024","unstructured":"Liu F X, Wang X Y, Yao W L, et al. MMC: advancing multimodal chart understanding with large-scale instruction tuning. In: Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, 2024. 1287\u20131310"},{"key":"4231_CR72","first-page":"1557","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"Y P Sun","year":"2019","unstructured":"Sun Y P, Ni Z H, Chng C K, et al. ICDAR 2019 competition on large-scale street view text with partial labeling-RRC-LSVT. In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 1557\u20131562"},{"key":"4231_CR73","first-page":"4291","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"A F Biten","year":"2019","unstructured":"Biten A F, Tito R, Mafla A, et al. Scene text visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019. 4291\u20134301"},{"key":"4231_CR74","first-page":"1429","volume-title":"Proceedings of the 14th IAPR International Conference on Document Analysis and Recognition","author":"B G Shi","year":"2017","unstructured":"Shi B G, Yao C, Liao M H, et al. ICDAR2017 competition on reading Chinese text in the wild (RCTW-17). In: Proceedings of the 14th IAPR International Conference on Document Analysis and Recognition, 2017. 1429\u20131434"},{"key":"4231_CR75","first-page":"1577","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"R Zhang","year":"2019","unstructured":"Zhang R, Zhou Y S, Jiang Q Y, et al. ICDAR 2019 robust reading challenge on reading Chinese text on signboard. In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 1577\u20131581"},{"key":"4231_CR76","first-page":"1571","volume-title":"Proceedings of the International Conference on Document Analysis and Recognition","author":"C K Chng","year":"2019","unstructured":"Chng C K, Liu Y L, Sun Y P, et al. ICDAR2019 robust reading challenge on arbitrary-shaped text-RRC-ART. 
In: Proceedings of the International Conference on Document Analysis and Recognition, 2019. 1571\u20131576"},{"key":"4231_CR77","first-page":"498","volume-title":"Proceedings of the European Conference on Computer Vision","author":"G Kim","year":"2022","unstructured":"Kim G, Hong T G, Yim M B, et al. OCR-free document understanding transformer. In: Proceedings of the European Conference on Computer Vision, 2022. 498\u2013517"},{"key":"4231_CR78","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1007\/s11390-019-1923-y","volume":"34","author":"T L Yuan","year":"2019","unstructured":"Yuan T L, Zhu Z, Xu K, et al. A large Chinese text dataset in the wild. J Comput Sci Technol, 2019, 34: 509\u2013521","journal-title":"J Comput Sci Technol"},{"key":"4231_CR79","volume-title":"COCO-Text: dataset and benchmark for text detection and recognition in natural images","author":"A Veit","year":"2016","unstructured":"Veit A, Matera T, Neumann L, et al. COCO-Text: dataset and benchmark for text detection and recognition in natural images. 2016. ArXiv:1601.07140"},{"key":"4231_CR80","first-page":"1527","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"N Methani","year":"2020","unstructured":"Methani N, Ganguly P, Khapra M M, et al. PlotQA: reasoning over scientific plots. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2020. 1527\u20131536"},{"key":"4231_CR81","first-page":"8802","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Singh","year":"2021","unstructured":"Singh A, Pang G, Toh M, et al. TextOCR: towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021. 8802\u20138812"},{"key":"4231_CR82","first-page":"1697","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"M Mathew","year":"2022","unstructured":"Mathew M, Bagal V, Tito R, et al. InfographicVQA. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2022. 1697\u20131706"},{"key":"4231_CR83","volume-title":"PP-OCRv3: more attempts for the improvement of ultra lightweight OCR system","author":"C X Li","year":"2022","unstructured":"Li C X, Liu W W, Guo R Y, et al. PP-OCRv3: more attempts for the improvement of ultra lightweight OCR system. 2022. ArXiv:2206.03001"},{"key":"4231_CR84","volume-title":"ShareGPT4V: improving large multi-modal models with better captions","author":"L Chen","year":"2023","unstructured":"Chen L, Li J S, Dong X Y, et al. ShareGPT4V: improving large multi-modal models with better captions. 2023. ArXiv:2311.12793"},{"key":"4231_CR85","first-page":"6904","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Y Goyal","year":"2017","unstructured":"Goyal Y, Khot T, Summers-Stay D, et al. Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017. 6904\u20136913"},{"key":"4231_CR86","first-page":"6700","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"D A Hudson","year":"2019","unstructured":"Hudson D A, Manning C D. GQA: a new dataset for real-world visual reasoning and compositional question answering. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019. 6700\u20136709"},{"key":"4231_CR87","first-page":"3195","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K Marino","year":"2019","unstructured":"Marino K, Rastegari M, Farhadi A, et al. OK-VQA: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019. 3195\u20133204"},{"key":"4231_CR88","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1162\/tacl_a_00566","volume":"11","author":"F Liu","year":"2023","unstructured":"Liu F, Emerson G, Collier N. Visual spatial reasoning. Trans Assoc Comput Linguist, 2023, 11: 635\u2013651","journal-title":"Trans Assoc Comput Linguist"},{"key":"4231_CR89","first-page":"326","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Das","year":"2017","unstructured":"Das A, Kottur S, Gupta K, et al. Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017. 326\u2013335"},{"key":"4231_CR90","first-page":"235","volume-title":"Proceedings of the European Conference on Computer Vision","author":"A Kembhavi","year":"2016","unstructured":"Kembhavi A, Salvato M, Kolve E, et al. A diagram is worth a dozen images. In: Proceedings of the European Conference on Computer Vision, 2016. 235\u2013251"},{"key":"4231_CR91","first-page":"2507","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"P Lu","year":"2022","unstructured":"Lu P, Mishra S, Xia T L, et al. Learn to explain: multimodal reasoning via thought chains for science question answering. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 2507\u20132521"},{"key":"4231_CR92","first-page":"4999","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Kembhavi","year":"2017","unstructured":"Kembhavi A, Seo M, Schwenk D, et al. Are you smarter than a sixth grader? Textbook question answering for multimodal machine comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017. 4999\u20135007"},{"key":"4231_CR93","first-page":"5648","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"K Kafle","year":"2018","unstructured":"Kafle K, Price B, Cohen S, et al. DVQA: understanding data visualizations via question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018. 5648\u20135656"},{"key":"4231_CR94","volume-title":"Aligning large multi-modal model with robust instruction tuning","author":"F X Liu","year":"2023","unstructured":"Liu F X, Lin K, Li L J, et al. Aligning large multi-modal model with robust instruction tuning. 2023. ArXiv:2306.14565"},{"key":"4231_CR95","first-page":"1511","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics","author":"J Cao","year":"2022","unstructured":"Cao J, Xiao J. An augmented benchmark dataset for geometric question answering through dual parallel text encoding. In: Proceedings of the 29th International Conference on Computational Linguistics, 2022. 
1511\u20131520"},{"key":"4231_CR96","volume-title":"Dynamic prompt learning via policy gradient for semi-structured mathematical reasoning","author":"P Lu","year":"2022","unstructured":"Lu P, Qiu L, Chang K W, et al. Dynamic prompt learning via policy gradient for semi-structured mathematical reasoning. 2022. ArXiv:2209.14610"},{"key":"4231_CR97","volume-title":"Metamath: bootstrap your own mathematical questions for large language models","author":"L H Yu","year":"2023","unstructured":"Yu L H, Jiang W, Shi H, et al. Metamath: bootstrap your own mathematical questions for large language models. 2023. ArXiv:2309.12284"},{"key":"4231_CR98","volume-title":"CLEVR-Math: a dataset for compositional language, visual and mathematical reasoning","author":"A D Lindstr\u00f6m","year":"2022","unstructured":"Lindstr\u00f6m A D, Abraham S S. CLEVR-Math: a dataset for compositional language, visual and mathematical reasoning. 2022. ArXiv:2208.05358"},{"key":"4231_CR99","first-page":"14963","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z W Li","year":"2023","unstructured":"Li Z W, Wang X R, Stengel-Eskin E, et al. Super-CLEVR: a virtual benchmark to diagnose domain robustness in visual reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023. 14963\u201314973"},{"key":"4231_CR100","volume-title":"Inter-GPS: interpretable geometry problem solving with formal language and symbolic reasoning","author":"P Lu","year":"2021","unstructured":"Lu P, Gong R, Jiang S B, et al. Inter-GPS: interpretable geometry problem solving with formal language and symbolic reasoning. 2021. ArXiv:2105.04165"},{"key":"4231_CR101","first-page":"8876","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"S Shah","year":"2019","unstructured":"Shah S, Mishra A, Yadati N, et al. KVQA: knowledge-aware visual question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2019. 8876\u20138884"},{"key":"4231_CR102","first-page":"146","volume-title":"Proceedings of European Conference on Computer Vision","author":"D Schwenk","year":"2022","unstructured":"Schwenk D, Khandelwal A, Clark C, et al. A-OKVQA: a benchmark for visual question answering using world knowledge. In: Proceedings of European Conference on Computer Vision, 2022. 146\u2013162"},{"key":"4231_CR103","doi-asserted-by":"publisher","first-page":"3108","DOI":"10.1145\/3477495.3531753","volume-title":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval","author":"P Lerner","year":"2022","unstructured":"Lerner P, Ferret O, Guinaudeau C, et al. ViQuAE, a dataset for knowledge-based visual question answering about named entities. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, 2022. 3108\u20133120"},{"key":"4231_CR104","volume-title":"Wanjuan: a comprehensive multimodal dataset for advancing English and Chinese large models","author":"C H He","year":"2023","unstructured":"He C H, Jin Z J, Xu C, et al. Wanjuan: a comprehensive multimodal dataset for advancing English and Chinese large models. 2023. ArXiv:2308.10755"},{"key":"4231_CR105","first-page":"947","volume-title":"Proceedings of International Conference on Document Analysis and Recognition","author":"A Mishra","year":"2019","unstructured":"Mishra A, Shekhar S, Singh A K, et al. 
OCR-VQA: visual question answering by reading text in images. In: Proceedings of International Conference on Document Analysis and Recognition, 2019. 947\u2013952"},{"key":"4231_CR106","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics","author":"C Clark","year":"2018","unstructured":"Clark C, Gardner M. Simple and effective multi-paragraph reading comprehension. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, 2018"},{"key":"4231_CR107","first-page":"69","volume-title":"Proceedings of the European Conference on Computer Vision","author":"L C Yu","year":"2016","unstructured":"Yu L C, Poirson P, Yang S, et al. Modeling context in referring expressions. In: Proceedings of the European Conference on Computer Vision, 2016. 69\u201385"},{"key":"4231_CR108","first-page":"11","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"J H Mao","year":"2016","unstructured":"Mao J H, Huang J, Toshev A, et al. Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016. 11\u201320"},{"key":"4231_CR109","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, et al. Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis, 2017, 123: 32\u201373","journal-title":"Int J Comput Vis"},{"key":"4231_CR110","volume-title":"To see is to believe: prompting GPT-4V for better visual instruction tuning","author":"J K Wang","year":"2023","unstructured":"Wang J K, Meng L C, Weng Z J, et al. To see is to believe: prompting GPT-4V for better visual instruction tuning. 2023. ArXiv:2311.07574"},{"key":"4231_CR111","volume-title":"ALLaVA: harnessing GPT-4V-synthesized data for a lite vision-language model","author":"G H Chen","year":"2024","unstructured":"Chen G H, Chen S N, Zhang R F, et al. ALLaVA: harnessing GPT-4V-synthesized data for a lite vision-language model. 2024. ArXiv:2402.11684"},{"key":"4231_CR112","volume-title":"SVIT: scaling up visual instruction tuning","author":"B Zhao","year":"2023","unstructured":"Zhao B, Wu B, Huang T J, et al. SVIT: scaling up visual instruction tuning. 2023. ArXiv:2307.04087"},{"key":"4231_CR113","first-page":"7","volume":"3","author":"R Taori","year":"2023","unstructured":"Taori R, Gulrajani I, Zhang T Y, et al. Alpaca: a strong, replicable instruction-following model. Stanford Center for Research on Foundation Models, 2023, 3: 7","journal-title":"Stanford Center for Research on Foundation Models"},{"key":"4231_CR114","volume-title":"COIG-CQIA: quality is all you need for Chinese instruction fine-tuning","author":"Y L Bai","year":"2024","unstructured":"Bai Y L, Du X R, Liang Y M, et al. COIG-CQIA: quality is all you need for Chinese instruction fine-tuning. 2024. ArXiv:2403.18058"},{"key":"4231_CR115","doi-asserted-by":"publisher","first-page":"11198","DOI":"10.1145\/3664647.3685520","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia","author":"H D Duan","year":"2024","unstructured":"Duan H D, Yang J M, Qiao Y X, et al. VLMEvalKit: an open-source toolkit for evaluating large multi-modality models. In: Proceedings of the 32nd ACM International Conference on Multimedia, 2024. 
11198\u201311201"},{"key":"4231_CR116","volume-title":"On the hidden mystery of OCR in large multimodal models","author":"Y L Liu","year":"2023","unstructured":"Liu Y L, Li Z, Li H L, et al. On the hidden mystery of OCR in large multimodal models. 2023. ArXiv:2305.07895"},{"key":"4231_CR117","volume-title":"MME: a comprehensive evaluation benchmark for multimodal large language models","author":"C Y Fu","year":"2023","unstructured":"Fu C Y, Chen P X, Shen Y H, et al. MME: a comprehensive evaluation benchmark for multimodal large language models. 2023. ArXiv:2306.13394"},{"key":"4231_CR118","first-page":"9556","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"X Yue","year":"2024","unstructured":"Yue X, Ni Y S, Zhang K, et al. MMMU: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 9556\u20139567"},{"key":"4231_CR119","volume-title":"MMBench: is your multi-modal model an all-around player?","author":"Y Liu","year":"2023","unstructured":"Liu Y, Duan H D, Zhang Y H, et al. MMBench: is your multi-modal model an all-around player? 2023. ArXiv:2307.06281"},{"key":"4231_CR120","volume-title":"Proceedings of 41st International Conference on Machine Learning","author":"W H Yu","year":"2024","unstructured":"Yu W H, Yang Z Y, Li L J, et al. MM-Vet: evaluating large multimodal models for integrated capabilities. In: Proceedings of 41st International Conference on Machine Learning, 2024"},{"key":"4231_CR121","volume-title":"Seed-bench: benchmarking multimodal LLMs with generative comprehension","author":"B H Li","year":"2023","unstructured":"Li B H, Wang R, Wang G Z, et al. Seed-bench: benchmarking multimodal LLMs with generative comprehension. 2023. ArXiv:2307.16125"},{"key":"4231_CR122","volume-title":"HallusionBench: an advanced diagnostic suite for entangled language hallucination & visual illusion in large vision-language models","author":"T R Guan","year":"2023","unstructured":"Guan T R, Liu F X, Wu X Y, et al. HallusionBench: an advanced diagnostic suite for entangled language hallucination & visual illusion in large vision-language models. 2023. ArXiv:2310.14566"},{"key":"4231_CR123","volume-title":"MathVista: evaluating mathematical reasoning of foundation models in visual contexts","author":"P Lu","year":"2023","unstructured":"Lu P, Bansal H, Xia T, et al. MathVista: evaluating mathematical reasoning of foundation models in visual contexts. 2023. ArXiv:2310.02255"},{"key":"4231_CR124","volume-title":"MMT-Bench: a comprehensive multimodal benchmark for evaluating large vision-language models towards multitask AGI","author":"K Ying","year":"2024","unstructured":"Ying K, Meng F Q, Wang J, et al. MMT-Bench: a comprehensive multimodal benchmark for evaluating large vision-language models towards multitask AGI. 2024. ArXiv:2404.16006"},{"key":"4231_CR125","volume-title":"Reka core, flash, and edge: a series of powerful multimodal language models","author":"A Ormazabal","year":"2024","unstructured":"Ormazabal A, Zheng C, d\u2019Autume C de M, et al. Reka core, flash, and edge: a series of powerful multimodal language models. 2024. ArXiv:2404.12387"},{"key":"4231_CR126","volume-title":"MPlug-Owl2: revolutionizing multi-modal large language model with modality collaboration","author":"Q H Ye","year":"2023","unstructured":"Ye Q H, Xu H Y, Ye J B, et al. 
MPlug-Owl2: revolutionizing multi-modal large language model with modality collaboration. 2023. ArXiv:2311.04257"},{"key":"4231_CR127","volume-title":"LLaMA-Adapter V2: parameter-efficient visual instruction model","author":"P Gao","year":"2023","unstructured":"Gao P, Han J M, Zhang R R, et al. LLaMA-Adapter V2: parameter-efficient visual instruction model. 2023. ArXiv:2304.15010"},{"key":"4231_CR128","first-page":"19730","volume-title":"Proceedings of the International Conference on Machine Learning","author":"J N Li","year":"2023","unstructured":"Li J N, Li D X, Savarese S, et al. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the International Conference on Machine Learning, 2023. 19730\u201319742"},{"key":"4231_CR129","volume-title":"ConvBench: a multi-turn conversation evaluation benchmark with hierarchical capability for large vision-language models","author":"S Liu","year":"2024","unstructured":"Liu S, Ying K, Zhang H, et al. ConvBench: a multi-turn conversation evaluation benchmark with hierarchical capability for large vision-language models. 2024. ArXiv:2403.20194"},{"key":"4231_CR130","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"J Hu","year":"2024","unstructured":"Hu J, Yao Y, Wang C Y, et al. Large multilingual models pivot zero-shot multimodal learning across languages. In: Proceedings of the 12th International Conference on Learning Representations, 2024"},{"key":"4231_CR131","first-page":"1918","volume-title":"Proceedings of the 27th International Conference on Computational Linguistics","author":"N Shimizu","year":"2018","unstructured":"Shimizu N, Rong N, Miyazaki T, et al. Visual question answering dataset for bilingual image understanding: a study of cross-lingual transfer using attention maps. In: Proceedings of the 27th International Conference on Computational Linguistics, 2018. 1918\u20131928"},{"key":"4231_CR132","first-page":"11748","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Y Zhang","year":"2022","unstructured":"Zhang Y, Wan X. BiRdQA: a bilingual dataset for question answering on tricky riddles. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2022. 
11748\u201311756"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4231-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4231-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4231-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T22:03:16Z","timestamp":1768860196000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4231-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":132,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["4231"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4231-5","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"15 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 September 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"220101"}}