{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T19:03:27Z","timestamp":1762110207147,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726972"},{"type":"electronic","value":"9783031726989"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72698-9_17","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T04:45:57Z","timestamp":1729831557000},"page":"286-302","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["FlexAttention for\u00a0Efficient High-Resolution Vision-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3557-4922","authenticated-orcid":false,"given":"Junyan","family":"Li","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9519-093X","authenticated-orcid":false,"given":"Delin","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8644-3895","authenticated-orcid":false,"given":"Tianle","family":"Cai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6847-1621","authenticated-orcid":false,"given":"Peihao","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0518-2099","authenticated-orcid":false,"given":"Yining","family":"Hong","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1619-932X","authenticated-orcid":false,"given":"Zhenfang","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6836-0510","authenticated-orcid":false,"given":"Yikang","family":"Shen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4031-5886","authenticated-orcid":false,"given":"Chuang","family":"Gan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"17_CR1","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"17_CR2","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning (2022)"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"17_CR4","unstructured":"Bavishi, R., et al.: Fuyu-8b: a multimodal architecture for ai agents (2024)"},{"key":"17_CR5","doi-asserted-by":"publisher","DOI":"10.1037\/10037-000","volume-title":"Perception and Communication","author":"DE Broadbent","year":"1958","unstructured":"Broadbent, D.E.: Perception and Communication. 
Pergamon Press, Oxford (1958). https:\/\/doi.org\/10.1037\/10037-000"},{"key":"17_CR6","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners (2020)"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Cai, H., Li, J., Hu, M., Gan, C., Han, S.: Efficientvit: lightweight multi-scale attention for high-resolution dense prediction. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01587"},{"key":"17_CR8","unstructured":"Chen, J., et al.: Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"17_CR9","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing gpt-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"17_CR10","unstructured":"Child, R., Gray, S., Radford, A., Sutskever, I.: Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509 (2019)"},{"key":"17_CR11","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"17_CR12","unstructured":"DeepMind, G.: Gemini (2023). https:\/\/deepmind.google\/technologies\/gemini"},{"key":"17_CR13","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv (2020)"},{"key":"17_CR14","unstructured":"Feng, H., Liu, Q., Liu, H., Zhou, W., Li, H., Huang, C.: Docpedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding. arXiv (2023)"},{"key":"17_CR15","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"17_CR16","unstructured":"Google: Bard (2023). https:\/\/bard.google.com"},{"key":"17_CR17","unstructured":"Gu, A., Dao, T.: Mamba: linear-time sequence modeling with selective state spaces. arXiv (2023)"},{"key":"17_CR18","unstructured":"Gu, A., Goel, K., R\u00e9, C.: Efficiently modeling long sequences with structured state spaces. In: ICLR (2022)"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Hao, W., Li, C., Li, X., Carin, L., Gao, J.: Towards learning a generic agent for vision-and-language navigation via pre-training. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Hong, W., et al.: Cogagent: a visual language model for gui agents (2023)","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"17_CR21","unstructured":"Hou, H., Yu, F.R.: Rwkv-ts: beyond traditional recurrent neural network for time series tasks. arXiv preprint arXiv:2401.09093 (2024)"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: Referitgame: referring to objects in photographs of natural scenes. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"17_CR24","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"17_CR25","unstructured":"Kitaev, N., Kaiser, L., Levskaya, A.: Reformer: the efficient transformer. 
In: ICLR (2020)"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Kuckreja, K., Danish, M.S., Naseer, M., Das, A., Khan, S., Khan, F.S.: Geochat: grounded large vision-language model for remote sensing. arXiv preprint arXiv:2311.15826 (2023)","DOI":"10.1109\/CVPR52733.2024.02629"},{"key":"17_CR27","unstructured":"Li, B., Zhang, P., Yang, J., Zhang, Y., Pu, F., Liu, Z.: Otterhd: a high-resolution multi-modality model. arXiv (2023)"},{"key":"17_CR28","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-vl: a universal encoder for vision and language by cross-modal pre-training. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"17_CR30","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models (2023)"},{"key":"17_CR31","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Li, W., Duan, L., Xu, D., Tsang, I.W.H.: Text-based image retrieval using progressive multi-instance learning. In: ICCV. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126478"},{"key":"17_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"17_CR36","unstructured":"Liu, H., et al.: Llava-next: improved reasoning, ocr, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: Mmbench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"issue":"12","key":"17_CR38","doi-asserted-by":"publisher","first-page":"8555","DOI":"10.1109\/TGRS.2020.2988782","volume":"58","author":"S Lobry","year":"2020","unstructured":"Lobry, S., Marcos, D., Murray, J., Tuia, D.: Rsvqa: visual question answering for remote sensing data. IEEE Trans. Geosci. Remote Sens. 58(12), 8555\u20138566 (2020)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"17_CR39","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. 
In: NeurIPS (2019)"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: Ok-vqa: a visual question answering benchmark requiring external knowledge. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"issue":"11","key":"17_CR41","doi-asserted-by":"publisher","first-page":"4700","DOI":"10.1523\/JNEUROSCI.13-11-04700.1993","volume":"13","author":"BA Oishausen","year":"1993","unstructured":"Oishausen, B.A., Anderson, C.H., Van Essen, D.C.: A neurobiological model of visual attention and invariant pattern recognition based on dynamic routing of information. J. Neurosci. 13(11), 4700\u20134719 (1993)","journal-title":"J. Neurosci."},{"key":"17_CR42","doi-asserted-by":"crossref","unstructured":"Palmer, S.E.: The psychology of perceptual organization: a transformational approach. In: Beck, J., Hope, B., Rosenfeld, A. (eds.) Human and Machine Vision. Academic Press, New York (1983)","DOI":"10.1016\/B978-0-12-084320-6.50015-3"},{"key":"17_CR43","unstructured":"Peng, H., Pappas, N., Yogatama, D., Schwartz, R., Smith, N.A., Kong, L.: Random feature attention. In: 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, 3\u20137 May 2021. OpenReview.net (2021). https:\/\/openreview.net\/forum?id=QtTKTdVrFBB"},{"key":"17_CR44","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML. PMLR (2021)"},{"key":"17_CR45","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: Videobert: a joint model for video and language representation learning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"17_CR47","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: learning cross-modality encoder representations from transformers. arXiv (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"17_CR48","unstructured":"Tay, Y., Bahri, D., Metzler, D., Juan, D.C., Zhao, Z., Zheng, C.: Synthesizer: rethinking self-attention in transformer models. arXiv preprint arXiv: 2005.00743 (2020)"},{"key":"17_CR49","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv (2023)"},{"key":"17_CR50","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"17_CR51","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"17_CR52","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity. arXiv preprint arXiv: Arxiv-2006.04768 (2020)"},{"key":"17_CR53","unstructured":"Wu, P., Xie, S.: V*: guided visual search as a core mechanism in multimodal llms, 17 (2023). arXiv preprint arXiv:2312.14135"},{"key":"17_CR54","unstructured":"Yang, S., Wang, B., Shen, Y., Panda, R., Kim, Y.: Gated linear attention transformers with hardware-efficient training. arXiv preprint arXiv: 2312.06635 (2023)"},{"key":"17_CR55","doi-asserted-by":"crossref","unstructured":"Yang, Z., Yang, D., Dyer, C., He, X., Smola, A., Hovy, E.: Hierarchical attention networks for document classification. 
In: NAACL (2016)","DOI":"10.18653\/v1\/N16-1174"},{"key":"17_CR56","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. TACL 2, 67\u201378 (2014)","journal-title":"TACL"},{"key":"17_CR57","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"17_CR58","unstructured":"Yu, W., et al.: Mm-vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"17_CR59","unstructured":"Zaheer, M., et al.: Big bird: transformers for longer sequences. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, 6\u201312 December 2020, virtual (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/c8512d142a2d849725f31a9a7a361ab9-Abstract.html"},{"key":"17_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: Vinvl: making visual representations matter in vision-language models. In: CVPR 2021 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"17_CR61","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and vqa. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"17_CR62","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72698-9_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T07:22:57Z","timestamp":1732951377000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72698-9_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031726972","9783031726989"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72698-9_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
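Note: the JSON above is a Crossref "work" record (message-version 1.0.0) for the ECCV 2024 chapter "FlexAttention for Efficient High-Resolution Vision-Language Models" (DOI 10.1007/978-3-031-72698-9_17). Below is a minimal sketch of how such a record could be retrieved and read, assuming it was obtained from the public Crossref REST API (GET https://api.crossref.org/works/<DOI>); the field names ("message", "title", "author", "reference", "references-count") are taken directly from the record above, while the script itself and its requests dependency are illustrative, not part of the chapter or of Crossref's own tooling.

# Sketch only: reads a Crossref work record like the one shown above.
# Assumption: the record comes from https://api.crossref.org/works/<DOI>;
# all field names below are copied from the record itself.
import requests

DOI = "10.1007/978-3-031-72698-9_17"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the "message" object in the record above

title = work["title"][0]                                    # chapter title
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
n_refs = work["references-count"]                           # 62 in this record

print(title)
print(", ".join(authors))
print(f"{n_refs} references")

# Split the reference list into entries that carry a Crossref-asserted DOI
# and entries that are unstructured strings only.
refs = work.get("reference", [])
with_doi = [r for r in refs if "DOI" in r]
print(f"{len(with_doi)} of {len(refs)} reference entries carry a DOI")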