{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:38:22Z","timestamp":1778081902053,"version":"3.51.4"},"publisher-location":"Cham","reference-count":90,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733369","type":"print"},{"value":"9783031733376","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73337-6_9","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T23:02:27Z","timestamp":1730329347000},"page":"148-166","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":53,"title":["BLINK: Multimodal Large Language Models Can See but\u00a0Not Perceive"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-9533-8427","authenticated-orcid":false,"given":"Xingyu","family":"Fu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7540-2413","authenticated-orcid":false,"given":"Yushi","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Bangzheng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Feng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6259-843X","authenticated-orcid":false,"given":"Haoyu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xudong","family":"Lin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1447-5173","authenticated-orcid":false,"given":"Dan","family":"Roth","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2310-6380","authenticated-orcid":false,"given":"Noah A.","family":"Smith","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5610-4322","authenticated-orcid":false,"given":"Wei-Chiu","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Ranjay","family":"Krishna","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"9_CR1","unstructured":"Introducing the next generation of claude. https:\/\/www.anthropic.com\/news\/claude-3-family (March 2024)"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Acharya, M., Kafle, K., Kanan, C.: Tallyqa: Answering complex counting questions. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33018076"},{"key":"9_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: Vqa: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 
2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"9_CR5","unstructured":"Awadalla, A., et al.: Openflamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"9_CR6","unstructured":"Bai, J., et al.: Qwen-vl: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Balntas, V., Lenc, K., Vedaldi, A., Mikolajczyk, K.: Hpatches: a benchmark and evaluation of handcrafted and learned local descriptors. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.410"},{"issue":"3\u201326","key":"9_CR8","first-page":"2","volume":"2","author":"H Barrow","year":"1978","unstructured":"Barrow, H., Tenenbaum, J., Hanson, A., Riseman, E.: Recovering intrinsic scene characteristics. Comput. Vis. Syst. 2, 3\u201326 (1978)","journal-title":"Comput. Vis. Syst"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Bell, S., Bala, K., Snavely, N.: Intrinsic images in the wild. ACM Trans. Graph. (SIGGRAPH) 33(4) (2014)","DOI":"10.1145\/2601097.2601206"},{"key":"9_CR10","unstructured":"Berrios, W., Mittal, G., Thrush, T., Kiela, D., Singh, A.: Towards language models that can see: Computer vision through the lens of natural language. arXiv preprint arXiv:2306.16410 (2023)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Black, M.J., Anandan, P.: A framework for the robust estimation of optical flow. In: 1993 (4th) International Conference on Computer Vision, pp. 231\u2013236. IEEE (1993)","DOI":"10.1109\/ICCV.1993.378214"},{"key":"9_CR12","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR13","volume-title":"Intrinsic Image Decomposition Via Ordinal Shading","author":"C Careaga","year":"2023","unstructured":"Careaga, C., Aksoy, Y.: Intrinsic Image Decomposition Via Ordinal Shading. ACM Trans. Graph. (2023)"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"9_CR15","unstructured":"Chen, J., et al.: Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Sharegpt4v: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"9_CR17","unstructured":"Chen, W., Fu, Z., Yang, D., Deng, J.: Single-image depth perception in the wild. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"9_CR18","unstructured":"Chen, X., et al.: Pali-x: on scaling up a multilingual vision and language model (2023)"},{"key":"9_CR19","unstructured":"Chowdhery, A., et al.: Palm: Scaling language modeling with pathways (2022)"},{"key":"9_CR20","unstructured":"Contributors, O.: Opencompass: A universal evaluation platform for foundation models. https:\/\/github.com\/open-compass\/opencompass (2023)"},{"key":"9_CR21","unstructured":"Contributors, X.: Xtuner: A toolkit for efficiently fine-tuning llm. 
https:\/\/github.com\/InternLM\/xtuner (2023)"},{"key":"9_CR22","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"9_CR23","unstructured":"Roberts, L.G.: Machine perception of three-dimensional solids. Ph.D. thesis, Massachusetts Institute of Technology (1963)"},{"key":"9_CR24","unstructured":"Dong, X., et al.: Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420 (2024)"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: Eva: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"9_CR26","unstructured":"Fu, C., et\u00a0al.: Mme: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"9_CR27","unstructured":"Fu, S., et al.: Dreamsim: learning new dimensions of human visual similarity using synthetic data (2023)"},{"key":"9_CR28","unstructured":"Fu, X., He, M., Lu, Y., Wang, W.Y., Roth, D.: Commonsense-t2i challenge: can text-to-image generation models understand commonsense? arXiv preprint arXiv:2406.07546 (2024)"},{"key":"9_CR29","doi-asserted-by":"publisher","unstructured":"Fu, X., et al.: Generate then select: open-ended visual question answering guided by world knowledge. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Findings of the Association for Computational Linguistics: ACL 2023, pp. 2333\u20132346. Association for Computational Linguistics, Toronto, Canada (Jul 2023). https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.147, https:\/\/aclanthology.org\/2023.findings-acl.147","DOI":"10.18653\/v1\/2023.findings-acl.147"},{"key":"9_CR30","doi-asserted-by":"publisher","unstructured":"Fu, X., Zhou, B., Chandratreya, I., Vondrick, C., Roth, D.: There\u2019s a time and place for reasoning beyond the image. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). pp. 1138\u20131149. Association for Computational Linguistics, Dublin, Ireland (May 2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.81, https:\/\/aclanthology.org\/2022.acl-long.81","DOI":"10.18653\/v1\/2022.acl-long.81"},{"key":"9_CR31","unstructured":"Fu, X., Zhou, B., Chen, S., Yatskar, M., Roth, D.: Interpretable by design visual question answering. arXiv preprint arXiv:2305.14882 (2023)"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in Visual Question Answering. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Guan, T., et al.: Hallusionbench: an advanced diagnostic suite for entangled language hallucination & visual illusion in large vision-language models (2023)","DOI":"10.1109\/CVPR52733.2024.01363"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"9_CR35","unstructured":"Harris, C., Stephens, M., et\u00a0al.: A combined corner and edge detector. In: Alvey vision conference. vol.\u00a015, pp. 10\u20135244. Citeseer (1988)"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Hartley, R., Zisserman, A.: Multiple view geometry in computer vision. Cambridge university press (2003)","DOI":"10.1017\/CBO9780511811685"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Hu, Y., Hua, H., Yang, Z., Shi, W., Smith, N.A., Luo, J.: Promptcap: prompt-guided task-aware image captioning. arXiv preprint arXiv:2211.09699 (2022)","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"9_CR38","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Tifa: accurate and interpretable text-to-image faithfulness evaluation with question answering. arXiv preprint arXiv:2303.11897 (2023)","DOI":"10.1109\/ICCV51070.2023.01866"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Visual program distillation: distilling tools and programmatic reasoning into vision-language models. arXiv preprint arXiv:2312.03052 (2023)","DOI":"10.1109\/CVPR52733.2024.00916"},{"key":"9_CR40","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"9_CR41","doi-asserted-by":"crossref","unstructured":"Lai, Z., Purushwalkam, S., Gupta, A.: The functional correspondence problem. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15772\u201315781 (2021)","DOI":"10.1109\/ICCV48922.2021.01548"},{"key":"9_CR42","unstructured":"Li, B., et al.: Seed-bench-2: benchmarking multimodal large language models. arXiv preprint arXiv:2311.17092 (2023)"},{"key":"9_CR43","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: Seed-bench: benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"9_CR44","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"9_CR45","doi-asserted-by":"publisher","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part V, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"9_CR46","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1162\/tacl_a_00566","volume":"11","author":"F Liu","year":"2023","unstructured":"Liu, F., Emerson, G., Collier, N.: Visual spatial reasoning. Trans. Assoc. Comput. Linguist. 11, 635\u2013651 (2023)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"9_CR47","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"9_CR48","unstructured":"Liu, H., et al.: Llava-next: improved reasoning, ocr, and world knowledge (January 2024). 
https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"9_CR49","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"9_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Mmbench: is your multi-modal model an all-around player? (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"9_CR51","unstructured":"Liu, Y., et\u00a0al.: On the hidden mystery of ocr in large multimodal models. arXiv preprint arXiv:2305.07895 (2023)"},{"key":"9_CR52","unstructured":"Liu, Y., et al.: On the hidden mystery of ocr in large multimodal models (2024)"},{"key":"9_CR53","doi-asserted-by":"crossref","unstructured":"Lowe, D.G.: Object recognition from local scale-invariant features. In: Proceedings of the seventh IEEE International Conference On Computer Vision. vol.\u00a02, pp. 1150\u20131157. IEEE (1999)","DOI":"10.1109\/ICCV.1999.790410"},{"key":"9_CR54","doi-asserted-by":"crossref","unstructured":"Lu, J., et al.: Unified-io 2: scaling autoregressive multimodal models with vision, language, audio, and action. arXiv preprint arXiv:2312.17172 (2023)","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"9_CR55","unstructured":"Lu, P., et al.: Mathvista: evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)"},{"key":"9_CR56","doi-asserted-by":"crossref","unstructured":"Marr, D.: Vision: A computational investigation into the human representation and processing of visual information. MIT press (2010)","DOI":"10.7551\/mitpress\/9780262514620.001.0001"},{"issue":"4262","key":"9_CR57","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1126\/science.968482","volume":"194","author":"D Marr","year":"1976","unstructured":"Marr, D., Poggio, T.: Cooperative computation of stereo disparity: a cooperative algorithm is derived for extracting disparity information from stereo image pairs. Science 194(4262), 283\u2013287 (1976)","journal-title":"Science"},{"key":"9_CR58","unstructured":"Min, J., Lee, J., Ponce, J., Cho, M.: Spair-71k: a large-scale benchmark for semantic correspondence. arXiv preprint arXiv:1908.10543 (2019)"},{"key":"9_CR59","unstructured":"Minsky, M., Papert, S.: Perceptrons: an introduction to computational geometry. MIT Press, Cambridge, Mass. (1969)"},{"key":"9_CR60","unstructured":"OpenAI: Gpt-4 technical report (2023)"},{"key":"9_CR61","unstructured":"Podell, D., et al.: Sdxl: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"9_CR62","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR63","doi-asserted-by":"crossref","unstructured":"Sarkar, A., Mai, H., Mahapatra, A., Lazebnik, S., Bhattad, A.: Shadows don\u2019t lie and lines can\u2019t bend! generative models don\u2019t know projective geometry... for now. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 28140\u201328149 (2024)","DOI":"10.1109\/CVPR52733.2024.02658"},{"key":"9_CR64","unstructured":"Schuhmann, C., et al.: Laion-400m: open dataset of clip-filtered 400 million image-text pairs. 
arXiv preprint arXiv:2111.02114 (2021)"},{"key":"9_CR65","doi-asserted-by":"publisher","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-okvqa: a benchmark for visual question answering using world knowledge. In: European Conference on Computer Vision, pp. 146\u2013162. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_9","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"9_CR66","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does clip know about a red circle? visual prompt engineering for vlms. arXiv preprint arXiv:2304.06712 (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"9_CR67","doi-asserted-by":"crossref","unstructured":"Sun, J., Shen, Z., Wang, Y., Bao, H., Zhou, X.: LoFTR: detector-free local feature matching with transformers. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00881"},{"key":"9_CR68","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., Cao, Y.: Eva-clip: improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)"},{"key":"9_CR69","unstructured":"Tang, L., Jia, M., Wang, Q., Phoo, C.P., Hariharan, B.: Emergent correspondence from image diffusion. arXiv preprint arXiv:2306.03881 (2023)"},{"key":"9_CR70","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"9_CR71","unstructured":"Team, I.: Internlm: A multilingual language model with progressively enhanced capabilities. https:\/\/github.com\/InternLM\/InternLM (2023)"},{"key":"9_CR72","unstructured":"Team, M.N.: Introducing mpt-7b: A new standard for open-source, commercially usable llms (2023). www.mosaicml.com\/blog\/mpt-7b. Accessed 05 May 2023"},{"issue":"9","key":"9_CR73","doi-asserted-by":"publisher","first-page":"1226","DOI":"10.1109\/TPAMI.2002.1033214","volume":"24","author":"A Torralba","year":"2002","unstructured":"Torralba, A., Oliva, A.: Depth estimation from image structure. IEEE Trans. Pattern Anal. Mach. Intell. 24(9), 1226\u20131238 (2002)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR74","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"9_CR75","unstructured":"Wang, F., et\u00a0al.: Muirbench: a comprehensive benchmark for robust multi-image understanding. arXiv preprint arXiv:2406.09411 (2024)"},{"key":"9_CR76","doi-asserted-by":"crossref","unstructured":"Wang, J.Y., Adelson, E.H.: Layered representation for motion analysis. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, pp. 361\u2013366. IEEE (1993)","DOI":"10.1109\/CVPR.1993.341105"},{"key":"9_CR77","unstructured":"Wang, W., et al.: Cogvlm: visual expert for pretrained language models (2023)"},{"key":"9_CR78","unstructured":"Wang, X., et al.: Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2203.11171 (2022)"},{"key":"9_CR79","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Dire for diffusion-generated image detection. arXiv preprint arXiv:2303.09295 (2023)","DOI":"10.1109\/ICCV51070.2023.02051"},{"key":"9_CR80","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models (2022)"},{"key":"9_CR81","unstructured":"Yan, A., et\u00a0al.: List items one by one: a new data source and learning paradigm for multimodal llms. 
arXiv preprint arXiv:2404.16375 (2024)"},{"key":"9_CR82","unstructured":"Yang, J., Zhang, H., Li, F., Zou, X., Li, C., Gao, J.: Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv preprint arXiv:2310.11441 (2023)"},{"key":"9_CR83","doi-asserted-by":"crossref","unstructured":"Yang, L., Kang, B., Huang, Z., Xu, X., Feng, J., Zhao, H.: Depth anything: unleashing the power of large-scale unlabeled data. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"9_CR84","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: An empirical study of gpt-3 for few-shot knowledge-based vqa. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 3081\u20133089 (2022)","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"9_CR85","unstructured":"Yang, Z., et al.: The dawn of lmms: preliminary explorations with gpt-4v(ision). arXiv preprint arXiv:2309.17421 (2023)"},{"key":"9_CR86","unstructured":"Yu, W., et al.: Mm-vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"9_CR87","doi-asserted-by":"crossref","unstructured":"Yue, X., et al.: Mmmu: a massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. arXiv preprint arXiv:2311.16502 (2023)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"9_CR88","first-page":"27469","volume":"35","author":"Y Ze","year":"2022","unstructured":"Ze, Y., Wang, X.: Category-level 6d object pose estimation in the wild: a semi-supervised learning approach and a new dataset. Adv. Neural. Inf. Process. Syst. 35, 27469\u201327483 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR89","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: Visual commonsense reasoning. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (June 2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"9_CR90","unstructured":"Zheng, L., et\u00a0al.: Judging llm-as-a-judge with mt-bench and chatbot arena. In: Advances in Neural Information Processing Systems, vol. 
36 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73337-6_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T15:01:04Z","timestamp":1732978864000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73337-6_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733369","9783031733376"],"references-count":90,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73337-6_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
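The record above is a raw "work" response from the Crossref REST API. As a minimal sketch of how such a record can be retrieved and inspected, assuming only the public works endpoint (https://api.crossref.org/works/{DOI}) and the Python standard library, one might write:

import json
import urllib.request

# DOI taken from the record above; the endpoint pattern is Crossref's
# public works API (assumed access point for this record).
DOI = "10.1007/978-3-031-73337-6_9"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

work = record["message"]                  # same object as "message" above
print(work["title"][0])                   # chapter title ("BLINK: ...")
print(work["reference-count"])            # 90
print(work["author"][0]["family"])        # "Fu"

The "message" object carries both the structured bibliographic fields (author, ISBN, license, conference assertions) and the per-reference "unstructured" citation strings; any post-processing of the bibliography would operate on work["reference"].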