{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T16:18:54Z","timestamp":1778602734768,"version":"3.51.4"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031728471","type":"print"},{"value":"9783031728488","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72848-8_1","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:37:52Z","timestamp":1732801072000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["VisionLLaMA: A Unified LLaMA Backbone for\u00a0Vision Tasks"],"prefix":"10.1007","author":[{"given":"Xiangxiang","family":"Chu","sequence":"first","affiliation":[]},{"given":"Jianlin","family":"Su","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Chunhua","family":"Shen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"1_CR1","unstructured":"Scao, T.L., et\u00a0al.: BLOOM: a 176B-parameter open-access multilingual language model. arXiv Computer Research Repository (2022)"},{"key":"1_CR2","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv Computer Research Repository (2016)"},{"key":"1_CR3","unstructured":"Biderman, S., et\u00a0al.: Pythia: a suite for analyzing large language models across training and scaling. In: Proceedings of the International Conference on Machine Learning (2023)"},{"key":"1_CR4","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Proceedings of the Advances in Neural Information Processing Systems (2020)"},{"key":"1_CR5","unstructured":"Chen, S., Wong, S., Chen, L., Tian, Y.: Extending context window of large language models via positional interpolation. arXiv Computer Research Repository (2023)"},{"key":"1_CR6","unstructured":"Chen, X., Liu, Z., Xie, S., He, K.: Deconstructing denoising diffusion models for self-supervised learning. arXiv Computer Research Repository (2024)"},{"key":"1_CR7","unstructured":"Chu, X., et\u00a0al.: MobileVLM: a fast, reproducible and strong vision language assistant for mobile devices. arXiv Computer Research Repository (2023)"},{"key":"1_CR8","unstructured":"Chu, X., et\u00a0al.: MobileVLM v2: faster and stronger baseline for vision language model. arXiv Computer Research Repository (2024)"},{"key":"1_CR9","unstructured":"Chu, X., et al.: Twins: revisiting the design of spatial attention in vision transformers. In: Proceedings of the Advances in Neural Information Processing Systems (2021)"},{"key":"1_CR10","unstructured":"Chu, X., Tian, Z., Zhang, B., Wang, X., Shen, C.: Conditional positional encodings for vision transformers. In: Proceedings of the International Conference on Learning Representation (2023)"},{"key":"1_CR11","unstructured":"Contributors, M.: OpenMMLab\u2019s pre-training toolbox and benchmark (2023)"},{"key":"1_CR12","unstructured":"Contributors, M.: MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark (2020)"},{"key":"1_CR13","unstructured":"Dao, T.: Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv Computer Research Repository (2023)"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1_CR15","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Proceedings of the Advances in Neural Information Processing Systems (2021)"},{"key":"1_CR16","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv Computer Research Repository (2020)"},{"key":"1_CR17","unstructured":"Frantar, E., Ashkboos, S., Hoefler, T., Alistarh, D.: GPTQ: accurate post-training quantization for generative pre-trained transformers. arXiv Computer Research Repository (2022)"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"1_CR20","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: Proceedings of the Advances in Neural Information Processing Systems (2017)"},{"key":"1_CR21","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Proceedings of the Advances in Neural Information Processing Systems (2020)"},{"key":"1_CR22","unstructured":"Stable code 3B: Coding on the edge (2024). https:\/\/stability.ai\/"},{"key":"1_CR23","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv Computer Research Repository (2013)"},{"key":"1_CR24","unstructured":"Kynk\u00e4\u00e4nniemi, T., Karras, T., Laine, S., Lehtinen, J., Aila, T.: Improved precision and recall metric for assessing generative models. In: Proceedings of the Advances in Neural Information Processing Systems (2019)"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: reasoning segmentation via large language model. arXiv Computer Research Repository (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"1_CR26","unstructured":"Li, G., Zheng, H., Liu, D., Wang, C., Su, B., Zheng, C.: SemMAE: semantic-guided masking for learning masked autoencoders. In: Proceedings of the Advances in Neural Information Processing Systems (2022)"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Li, L., Li, Q., Zhang, B., Chu, X.: Norm tweaking: high-performance low-bit quantization of large language models. In: Proceedings of the AAAI Conference on Artificial Intelligence (2024)","DOI":"10.1609\/aaai.v38i17.29815"},{"key":"1_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/978-3-031-20077-9_17","volume-title":"Computer Vision \u2013 ECCV 2022","author":"Y Li","year":"2022","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 280\u2013296. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_17"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv Computer Research Repository (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"1_CR30","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Pattern Recogn. (2023)"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhang, S., Chen, J., Yu, Z., Chen, K., Lin, D.: Improving pixel-based MIM by reducing wasted modeling capability. In: Proceedings Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/ICCV51070.2023.00494"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Ma, N., Goldstein, M., Albergo, M.S., Boffi, N.M., Vanden-Eijnden, E., Xie, S.: SiT: exploring flow and diffusion-based generative models with scalable interpolant transformers. arXiv Computer Research Repository (2024)","DOI":"10.1007\/978-3-031-72980-5_2"},{"key":"1_CR34","unstructured":"Nash, C., Menick, J., Dieleman, S., Battaglia, P.W.: Generating images with sparse representations. arXiv Computer Research Repository (2021)"},{"key":"1_CR35","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Parmar, G., Zhang, R., Zhu, J.Y.: On aliased resizing and surprising subtleties in GAN evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"1_CR37","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"1_CR38","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning (2021)"},{"key":"1_CR39","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents (2022)"},{"key":"1_CR40","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"1_CR42","unstructured":"Roziere, B., et\u00a0al.: Code llama: Open foundation models for code. arXiv Computer Research Repository (2023)"},{"key":"1_CR43","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: Proceedings of the Advances in Neural Information Processing Systems (2016)"},{"key":"1_CR44","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. arXiv Computer Research Repository (2018)","DOI":"10.18653\/v1\/N18-2074"},{"key":"1_CR45","unstructured":"Shazeer, N.: GLU variants improve transformer. arXiv Computer Research Repository (2020)"},{"key":"1_CR46","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: Proceedings of the International Conference on Machine Learning (2015)"},{"key":"1_CR47","doi-asserted-by":"crossref","unstructured":"Su, J., Ahmed, M., Lu, Y., Pan, S., Bo, W., Liu, Y.: RoFormer: enhanced transformer with rotary position embedding. Int. J. Comput. Vision (2023)","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"1_CR48","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., Jegou, H.: Training data-efficient image transformers and distillation through attention. In: Proceedings of the International Conference on Machine Learning (2021)"},{"key":"1_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"516","DOI":"10.1007\/978-3-031-20053-3_30","volume-title":"Computer Vision \u2013 ECCV 2022","author":"H Touvron","year":"2022","unstructured":"Touvron, H., Cord, M., J\u00e9gou, H.: DeiT III: revenge of the ViT. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13684, pp. 516\u2013533. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_30"},{"key":"1_CR50","unstructured":"Touvron, H., et al.: Llama: Open and efficient foundation language models. arXiv Computer Research Repository (2023)"},{"key":"1_CR51","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: open foundation and fine-tuned chat models. arXiv Computer Research Repository (2023)"},{"key":"1_CR52","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of the Advances in Neural Information Processing Systems (2017)"},{"key":"1_CR53","unstructured":"Vishniakov, K., Shen, Z., Liu, Z.: Convnet vs transformer, supervised vs CLIP: beyond imagenet accuracy. arXiv Computer Research Repository (2023)"},{"key":"1_CR54","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"1_CR55","doi-asserted-by":"crossref","unstructured":"Wei, C., Fan, H., Xie, S., Wu, C.Y., Yuille, A., Feichtenhofer, C.: Masked feature prediction for self-supervised visual pre-training. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"1_CR56","unstructured":"Wei, F., Zhang, X., Zhang, A., Zhang, B., Chu, X.: Lenna: language enhanced reasoning detection assistant. arXiv Computer Research Repository (2023)"},{"key":"1_CR57","unstructured":"Xiao, G., Lin, J., Seznec, M., Wu, H., Demouth, J., Han, S.: SmoothQuant: accurate and efficient post-training quantization for large language models. In: Proceedings of the International Conference on Machine Learning (2023)"},{"key":"1_CR58","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: Proceedings of the European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"1_CR59","doi-asserted-by":"crossref","unstructured":"Xie, Z., et al.: SimMIM: a simple framework for masked image modeling. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"1_CR60","unstructured":"Xiong, W., et\u00a0al.: Effective long-context scaling of foundation models. arXiv Computer Research Repository (2023)"},{"key":"1_CR61","unstructured":"Yang, A., et\u00a0al.: Baichuan 2: open large-scale language models. arXiv Computer Research Repository (2023)"},{"key":"1_CR62","unstructured":"Zhang, B., Sennrich, R.: Root mean square layer normalization. In: Proceedings of the Advances in Neural Information Processing Systems (2019)"},{"key":"1_CR63","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ADE20K dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"1_CR64","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: Proceedings of the International Conference on Learning Representation (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72848-8_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:03:33Z","timestamp":1732802613000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72848-8_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031728471","9783031728488"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72848-8_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}