{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,18]],"date-time":"2026-07-18T22:08:12Z","timestamp":1784412492662,"version":"3.55.0"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732348","type":"print"},{"value":"9783031732355","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73235-5_23","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T06:01:53Z","timestamp":1727589713000},"page":"408-424","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":46,"title":["Vary: Scaling up\u00a0the\u00a0Vision Vocabulary for\u00a0Large Vision-Language Model"],"prefix":"10.1007","author":[{"given":"Haoran","family":"Wei","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lingyu","family":"Kong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinyue","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zheng","family":"Ge","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinrong","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianjian","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chunrui","family":"Han","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiangyu","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"23_CR1","unstructured":"Alayrac, J., et al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"23_CR2","unstructured":"Alibaba: Introducing qwen-7b: Open foundation and human-aligned models (of the state-of-the-arts). https:\/\/github.com\/QwenLM\/Qwen-7B (2023)"},{"key":"23_CR3","unstructured":"Bai, J., et al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"23_CR4","unstructured":"Bai, J., et al.: Qwen-vl: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"23_CR5","unstructured":"Blecher, L., Cucurull, G., Scialom, T., Stojnic, R.: Nougat: Neural optical understanding for academic documents. arXiv preprint arXiv:2308.13418 (2023)"},{"key":"23_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR7","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: Unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"23_CR8","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/ (2023)"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Davis, B., Morse, B., Price, B., Tensmeyer, C., Wigington, C., Morariu, V.: End-to-end document recognition and understanding with dessurt. In: European Conference on Computer Vision, pp. 280\u2013296. Springer (2022)","DOI":"10.1007\/978-3-031-25069-9_19"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: Transvg: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"23_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"23_CR12","unstructured":"Dong, R., et\u00a0al.: Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)"},{"key":"23_CR13","first-page":"6616","volume":"33","author":"Z Gan","year":"2020","unstructured":"Gan, Z., Chen, Y.C., Li, L., Zhu, C., Cheng, Y., Liu, J.: Large-scale adversarial training for vision-and-language representation learning. Adv. Neural. Inf. Process. Syst. 33, 6616\u20136628 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR14","unstructured":"Hao, Y., et al.: Language models are general-purpose interfaces. arXiv preprint arXiv:2206.06336 (2022)"},{"key":"23_CR15","unstructured":"Huang, S., et\u00a0al.: Language is not all you need: Aligning perception with language models. arXiv preprint arXiv:2302.14045 (2023)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: Referitgame: referring to objects in photographs of natural scenes. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: Ocr-free document understanding transformer. In: European Conference on Computer Vision, pp. 498\u2013517. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"23_CR18","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"issue":"7","key":"23_CR19","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., et al.: The open images dataset v4: unified image classification, object detection, and visual relationship detection at scale. Int. J. Comput. Vision 128(7), 1956\u20131981 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"23_CR20","unstructured":"Lee, K., et al.: Pix2struct: screenshot parsing as pretraining for visual language understanding. In: International Conference on Machine Learning, pp. 18893\u201318912. PMLR (2023)"},{"key":"23_CR21","unstructured":"Levenshtein, V.I., et\u00a0al.: Binary codes capable of correcting deletions, insertions, and reversals. In: Soviet physics doklady, vol.\u00a010, pp. 707\u2013710. Soviet Union (1966)"},{"key":"23_CR22","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"23_CR23","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: European Conference on Computer Vision, pp. 280\u2013296. Springer (2022)","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Liang, Y., et\u00a0al.: Taskmatrix. ai: Completing tasks by connecting foundation models with millions of apis. arXiv preprint arXiv:2303.16434 (2023)","DOI":"10.34133\/icomputing.0063"},{"key":"23_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"23_CR27","unstructured":"Liu, F., et al.: Matcha: Enhancing visual language pretraining with math reasoning and chart derendering. arXiv preprint arXiv:2212.09662 (2022)"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"23_CR29","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023)"},{"key":"23_CR30","unstructured":"Loshchilov, I., Hutter, F.: Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)"},{"key":"23_CR31","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Masry, A., Long, D.X., Tan, J.Q., Joty, S., Hoque, E.: Chartqa: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"23_CR33","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: Docvqa: a dataset for vqa on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: Ocr-vqa: visual question answering by reading text in images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 947\u2013952. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"23_CR35","unstructured":"OpenAI: Gpt-4 technical report (2023)"},{"key":"23_CR36","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: NeurIPS (2022)"},{"key":"23_CR37","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"8","key":"23_CR38","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"issue":"1","key":"23_CR39","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"23_CR40","unstructured":"Schuhmann, C., et al.: Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"23_CR41","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"23_CR42","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface. arXiv preprint arXiv:2303.17580 (2023)"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards vqa models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"23_CR44","unstructured":"Taori, R., et al.: Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca (2023)"},{"key":"23_CR45","unstructured":"Team, M., et\u00a0al.: Introducing mpt-7b: A new standard for open-source, commercially usable llms (2023)"},{"key":"23_CR46","unstructured":"Touvron, H., et al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"23_CR47","unstructured":"Veit, A., Matera, T., Neumann, L., Matas, J., Belongie, S.: Coco-text: Dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140 (2016)"},{"key":"23_CR48","unstructured":"Wang, P., et al.: Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"23_CR49","unstructured":"Wang, W., et\u00a0al.: Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)"},{"key":"23_CR50","unstructured":"Wei, J., et\u00a0al.: Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)"},{"key":"23_CR51","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual chatgpt: Talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"23_CR52","doi-asserted-by":"crossref","unstructured":"Xu, C., Guo, D., Duan, N., McAuley, J.: Baize: An open-source chat model with parameter-efficient tuning on self-chat data. arXiv preprint arXiv:2304.01196 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.385"},{"key":"23_CR53","unstructured":"Yang, R., et al.: Gpt4tools: teaching large language model to use tools via self-instruction. arXiv preprint arXiv:2305.18752 (2023)"},{"key":"23_CR54","doi-asserted-by":"publisher","unstructured":"Yang, Z., et al.: Unitab: Unifying text and box outputs for grounded vision-language modeling. In: European Conference on Computer Vision. pp. 521\u2013539. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_30","DOI":"10.1007\/978-3-031-20059-5_30"},{"key":"23_CR55","unstructured":"Yang, Z., et al.: Mm-react: Prompting chatgpt for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"23_CR56","unstructured":"Ye, J., et\u00a0al.: mplug-docowl: Modularized multimodal large language model for document understanding. arXiv preprint arXiv:2307.02499 (2023)"},{"key":"23_CR57","doi-asserted-by":"crossref","unstructured":"Yu, E., et\u00a0al.: Merlin: Empowering multimodal llms with foresight minds. arXiv preprint arXiv:2312.00589 (2023)","DOI":"10.1007\/978-3-031-73235-5_24"},{"key":"23_CR58","unstructured":"Yu, W., et al.: Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"23_CR59","unstructured":"Zeng, A., et\u00a0al.: Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)"},{"key":"23_CR60","unstructured":"Zhang, A., Zhao, L., Xie, C.W., Zheng, Y., Ji, W., Chua, T.S.: Next-chat: An lmm for chat, detection and segmentation. arXiv preprint arXiv:2311.04498 (2023)"},{"key":"23_CR61","unstructured":"Zhang, S., et\u00a0al.: Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"23_CR62","doi-asserted-by":"crossref","unstructured":"Zhao, L., et\u00a0al.: Chatspot: Bootstrapping multimodal llms via precise referring instruction tuning. arXiv preprint arXiv:2307.09474 (2023)","DOI":"10.24963\/ijcai.2024\/193"},{"key":"23_CR63","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73235-5_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:17:07Z","timestamp":1732828627000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73235-5_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031732348","9783031732355"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73235-5_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}