{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:08:26Z","timestamp":1777568906672,"version":"3.51.4"},"publisher-location":"Cham","reference-count":99,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729669","type":"print"},{"value":"9783031729676","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72967-6_16","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:08:36Z","timestamp":1730574516000},"page":"273-302","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["MoAI: Mixture of All Intelligence for Large Language and Vision Models"],"prefix":"10.1007","author":[{"given":"Byung-Kwan","family":"Lee","sequence":"first","affiliation":[]},{"given":"Beomchan","family":"Park","sequence":"additional","affiliation":[]},{"given":"Chae","family":"Won Kim","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Man Ro","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"16_CR1","unstructured":"Yi-VL-34B. https:\/\/www.01.ai\/"},{"key":"16_CR2","unstructured":"Achiam, J., et al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"16_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR4","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"16_CR5","unstructured":"Bengio, Y., L\u00e9onard, N., Courville, A.: Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:1308.3432 (2013)"},{"key":"16_CR6","doi-asserted-by":"publisher","unstructured":"Biederman, I., Mezzanotte, R.J., Rabinowitz, J.C.: Scene perception: detecting and judging objects undergoing relational violations. Cogn. Psychol. 14(2), 143\u2013177 (1982). https:\/\/doi.org\/10.1016\/0010-0285(82)90007-X. https:\/\/www.sciencedirect.com\/science\/article\/pii\/001002858290007X","DOI":"10.1016\/0010-0285(82)90007-X"},{"key":"16_CR7","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR8","unstructured":"Cai, Z., et al.: Internlm2 technical report. arXiv preprint arXiv:2403.17297 (2024)"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"16_CR10","unstructured":"Chen, J., et al.: Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"16_CR11","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"16_CR12","unstructured":"Chen, L., et al.: Are we on the right way for evaluating large vision-language models? arXiv preprint arXiv:2403.20330 (2024)"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Sharegpt4v: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"issue":"240","key":"16_CR15","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"16_CR16","unstructured":"Christiano, P.F., Leike, J., Brown, T., Martic, M., Legg, S., Amodei, D.: Deep reinforcement learning from human preferences. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"16_CR17","unstructured":"Chung, H.W., et al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"16_CR18","unstructured":"XTuner Contributors: Xtuner: a toolkit for efficiently fine-tuning LLM (2023). https:\/\/github.com\/InternLM\/xtuner"},{"key":"16_CR19","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"16_CR21","unstructured":"Dettmers, T., Pagnoni, A., Holtzman, A., Zettlemoyer, L.: QLoRA: efficient finetuning of quantized LLMs. arXiv preprint arXiv:2305.14314 (2023)"},{"key":"16_CR22","unstructured":"Doveh, S., et al.: Dense and aligned captions (DAC) promote compositional reasoning in VL models. In: Oh, A., Neumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a036, pp. 76137\u201376150. Curran Associates, Inc. (2023). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/efe406d6d2674d176cdcd958ce605d17-Paper-Conference.pdf"},{"key":"16_CR23","unstructured":"Du, Y., et al.: PP-OCRv2: bag of tricks for ultra lightweight OCR system. arXiv preprint arXiv:2109.03144 (2021)"},{"key":"16_CR24","unstructured":"Eigen, D., Ranzato, M., Sutskever, I.: Learning factored representations in a deep mixture of experts. arXiv preprint arXiv:1312.4314 (2013)"},{"key":"16_CR25","doi-asserted-by":"publisher","first-page":"373","DOI":"10.1146\/annurev-vision-091718-014809","volume":"5","author":"RA Epstein","year":"2019","unstructured":"Epstein, R.A., Baker, C.I.: Scene perception in the human brain. Annu. Rev. Vis. Sci. 5, 373\u2013397 (2019)","journal-title":"Annu. Rev. Vis. Sci."},{"issue":"1","key":"16_CR26","first-page":"5232","volume":"23","author":"W Fedus","year":"2022","unstructured":"Fedus, W., Zoph, B., Shazeer, N.: Switch transformers: scaling to trillion parameter models with simple and efficient sparsity. J. Mach. Learn. Res. 23(1), 5232\u20135270 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"16_CR27","doi-asserted-by":"publisher","unstructured":"Freitag, M., Al-Onaizan, Y.: Beam search strategies for neural machine translation. In: Luong, T., Birch, A., Neubig, G., Finch, A. (eds.) Proceedings of the First Workshop on Neural Machine Translation, pp. 56\u201360. Association for Computational Linguistics, Vancouver (2017). https:\/\/doi.org\/10.18653\/v1\/W17-3207. https:\/\/aclanthology.org\/W17-3207","DOI":"10.18653\/v1\/W17-3207"},{"key":"16_CR28","unstructured":"Fu, C., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"16_CR29","unstructured":"Gong, T., et al.: Multimodal-GPT: a vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790 (2023)"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"16_CR31","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)"},{"issue":"8","key":"16_CR32","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"16_CR33","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"16_CR34","unstructured":"Iyer, S., et al.: OPT-IML: scaling language model instruction meta learning through the lens of generalization. arXiv preprint arXiv:2212.12017 (2022)"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Iyyer, M., Yih, W.T., Chang, M.W.: Search-based neural structured learning for sequential question answering. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1821\u20131831 (2017)","DOI":"10.18653\/v1\/P17-1167"},{"issue":"1","key":"16_CR36","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","volume":"3","author":"RA Jacobs","year":"1991","unstructured":"Jacobs, R.A., Jordan, M.I., Nowlan, S.J., Hinton, G.E.: Adaptive mixtures of local experts. Neural Comput. 3(1), 79\u201387 (1991)","journal-title":"Neural Comput."},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Jain, J., Li, J., Chiu, M.T., Hassani, A., Orlov, N., Shi, H.: Oneformer: one transformer to rule universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2989\u20132998 (2023)","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"16_CR38","unstructured":"Jiang, A.Q., et al.: Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)"},{"key":"16_CR39","first-page":"17148","volume":"34","author":"J Kim","year":"2021","unstructured":"Kim, J., Lee, B.K., Ro, Y.M.: Distilling robust and non-robust features in adversarial examples by information bottleneck. Adv. Neural. Inf. Process. Syst. 34, 17148\u201317159 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR40","doi-asserted-by":"crossref","unstructured":"Kim, J., Lee, B.K., Ro, Y.M.: Causal unsupervised semantic segmentation. arXiv preprint arXiv:2310.07379 (2023)","DOI":"10.2139\/ssrn.4967689"},{"key":"16_CR41","doi-asserted-by":"crossref","unstructured":"Kim, J., Lee, B.K., Ro, Y.M.: Demystifying causal features on adversarial examples and causal inoculation for robust network by adversarial instrumental variable regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12302\u201312312 (2023)","DOI":"10.1109\/CVPR52729.2023.01184"},{"key":"16_CR42","unstructured":"Kim, K., Yoon, K., In, Y., Moon, J., Kim, D., Park, C.: Adaptive self-training framework for fine-grained scene graph generation. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=WipsLtH77t"},{"key":"16_CR43","doi-asserted-by":"crossref","unstructured":"Kim, Y., Kim, J., Lee, B.K., Shin, S., Ro, Y.M.: Mitigating dataset bias in image captioning through clip confounder-free captioning network. In: 2023 IEEE International Conference on Image Processing (ICIP), pp. 1720\u20131724. IEEE (2023)","DOI":"10.1109\/ICIP49359.2023.10222502"},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"16_CR45","unstructured":"Komatsuzaki, A., et al.: Sparse upcycling: training mixture-of-experts from dense checkpoints. arXiv preprint arXiv:2212.05055 (2022)"},{"key":"16_CR46","unstructured":"Lauren\u00e7on, H., et al.: Obelisc: an open web-scale filtered dataset of interleaved image-text documents. arXiv preprint arXiv:2306.16527 (2023)"},{"key":"16_CR47","unstructured":"Lee, B.K.: Training encoder-attention through fully-connected CRFs for efficient end-to-end lane detection model (2020)"},{"key":"16_CR48","doi-asserted-by":"crossref","unstructured":"Lee, B.K., Chung, S., Kim, C.W., Park, B., Ro, Y.M.: TROL: traversal of layers for large language and vision models. arXiv preprint arXiv:2406.12246 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.633"},{"key":"16_CR49","doi-asserted-by":"crossref","unstructured":"Lee, B.K., Kim, C.W., Park, B., Ro, Y.M.: Meteor: mamba-based traversal of rationale for large language and vision models. arXiv preprint arXiv:2405.15574 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.633"},{"key":"16_CR50","doi-asserted-by":"crossref","unstructured":"Lee, B.K., Kim, J., Ro, Y.M.: Masking adversarial damage: finding adversarial saliency for robust and sparse network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15126\u201315136 (2022)","DOI":"10.1109\/CVPR52688.2022.01470"},{"key":"16_CR51","doi-asserted-by":"crossref","unstructured":"Lee, B.K., Kim, J., Ro, Y.M.: Mitigating adversarial vulnerability through causal parameter estimation by adversarial double machine learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4499\u20134509 (2023)","DOI":"10.1109\/ICCV51070.2023.00415"},{"key":"16_CR52","doi-asserted-by":"crossref","unstructured":"Lee, B.K., Park, B., Kim, C.W., Ro, Y.M.: Collavo: crayon large language and vision model. arXiv preprint arXiv:2402.11248 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.66"},{"key":"16_CR53","unstructured":"Lee, B.K., Yu, Y., Ro, Y.M.: Towards adversarial robustness of Bayesian neural network through hierarchical variational inference (2020)"},{"key":"16_CR54","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"16_CR55","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: Seed-bench: Benchmarking multimodal LLMs with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"16_CR56","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"16_CR57","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: TrOCR: transformer-based optical character recognition with pre-trained models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 13094\u201313102 (2023)","DOI":"10.1609\/aaai.v37i11.26538"},{"key":"16_CR58","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"16_CR59","unstructured":"Lin, B., et al.: MoE-LLaVA: mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947 (2024)"},{"key":"16_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"16_CR61","doi-asserted-by":"crossref","unstructured":"Liu, B., et al.: Show, deconfound and tell: image captioning with causal inference. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18041\u201318050 (2022)","DOI":"10.1109\/CVPR52688.2022.01751"},{"key":"16_CR62","unstructured":"Liu, F., et al.: Hallusionbench: you see what you think? Or you think what you see? An image-context reasoning benchmark challenging for GPT-4V (ision), LLaVA-1.5, and other multi-modality models. arXiv preprint arXiv:2310.14566 (2023)"},{"key":"16_CR63","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"16_CR64","unstructured":"Liu, H., et al.: Llava-next: improved reasoning, OCR, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"16_CR65","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"key":"16_CR66","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Mmbench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"16_CR67","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"16_CR68","unstructured":"Loshchilov, I., Hutter, F.: SGDR: stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)"},{"key":"16_CR69","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2019). https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"16_CR70","doi-asserted-by":"crossref","unstructured":"Minderer, M., Gritsenko, A.A., Houlsby, N.: Scaling open-vocabulary object detection. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=mQPNcBWjGc","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"16_CR71","first-page":"9564","volume":"35","author":"B Mustafa","year":"2022","unstructured":"Mustafa, B., Riquelme, C., Puigcerver, J., Jenatton, R., Houlsby, N.: Multimodal contrastive learning with limoe: the language-image mixture of experts. Adv. Neural. Inf. Process. Syst. 35, 9564\u20139576 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR72","unstructured":"OpenAI: GPT-4V(ision) system card (2023). https:\/\/openai.com\/research\/gpt-4v-system-card. Accessed 13 Feb 2024"},{"key":"16_CR73","unstructured":"OpenAI: GPT-4V(ision) technical work and authors (2023). https:\/\/openai.com\/contributions\/gpt-4v. Accessed 13 Feb 2024"},{"key":"16_CR74","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR75","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021)"},{"key":"16_CR76","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"16_CR77","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners (2019)"},{"issue":"1","key":"16_CR78","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"16_CR79","first-page":"8583","volume":"34","author":"C Riquelme","year":"2021","unstructured":"Riquelme, C., et al.: Scaling vision with sparse mixture of experts. Adv. Neural. Inf. Process. Syst. 34, 8583\u20138595 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR80","unstructured":"Shazeer, N., et al.: Outrageously large neural networks: the sparsely-gated mixture-of-experts layer. In: International Conference on Learning Representations (2017). https:\/\/openreview.net\/forum?id=B1ckMDqlg"},{"key":"16_CR81","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"16_CR82","first-page":"3008","volume":"33","author":"N Stiennon","year":"2020","unstructured":"Stiennon, N., et al.: Learning to summarize with human feedback. Adv. Neural. Inf. Process. Syst. 33, 3008\u20133021 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR83","unstructured":"Gemini Team Google, et al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"16_CR84","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"16_CR85","unstructured":"Wang, W., et al.: Cogvlm: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"16_CR86","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. In: International Conference on Learning Representations (2022)"},{"key":"16_CR87","unstructured":"Wu, H., et al.: Q-bench: a benchmark for general-purpose foundation models on low-level vision. arXiv preprint arXiv:2309.14181 (2023)"},{"key":"16_CR88","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"178","DOI":"10.1007\/978-3-031-19812-0_11","volume-title":"European Conference on Computer Vision","author":"J Yang","year":"2022","unstructured":"Yang, J., Ang, Y.Z., Guo, Z., Zhou, K., Zhang, W., Liu, Z.: Panoptic scene graph generation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision. LNCS, vol. 13687, pp. 178\u2013196. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_11"},{"key":"16_CR89","unstructured":"Ye, Q., et al.: mPLUG-Owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"16_CR90","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. arXiv preprint arXiv:2311.04257 (2023)","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"16_CR91","unstructured":"Yu, W., et al.: MM-Vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"16_CR92","unstructured":"Zhang, P., et al.: Internlm-xcomposer: a vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)"},{"key":"16_CR93","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"16_CR94","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ADE20K dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"16_CR95","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., Zhao, H., Puig, X., Xiao, T., Fidler, S., Barriuso, A., Torralba, A.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vision 127, 302\u2013321 (2019)","journal-title":"Int. J. Comput. Vision"},{"key":"16_CR96","first-page":"7103","volume":"35","author":"Y Zhou","year":"2022","unstructured":"Zhou, Y., et al.: Mixture-of-experts with expert choice routing. Adv. Neural. Inf. Process. Syst. 35, 7103\u20137114 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR97","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"16_CR98","doi-asserted-by":"crossref","unstructured":"Zong, Z., Song, G., Liu, Y.: DETRs with collaborative hybrid assignments training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6748\u20136758 (2023)","DOI":"10.1109\/ICCV51070.2023.00621"},{"key":"16_CR99","unstructured":"Zoph, B., et al.: ST-MoE: designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72967-6_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T18:21:33Z","timestamp":1732990893000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72967-6_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031729669","9783031729676"],"references-count":99,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72967-6_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}