{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:48:24Z","timestamp":1775580504790,"version":"3.50.1"},"publisher-location":"Cham","reference-count":95,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733963","type":"print"},{"value":"9783031733970","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73397-0_4","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:05:41Z","timestamp":1730574341000},"page":"55-73","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["GiT: Towards Generalist Vision Transformer Through Universal Language Interface"],"prefix":"10.1007","author":[{"given":"Haiyang","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shaoshuai","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Muhammad Ferjad","family":"Naeem","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liwei","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"4_CR1","unstructured":"Aakanksha, C., et\u00a0al.: Palm: scaling language modeling with pathways. In: JMLR (2023)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: Nocaps: novel object captioning at scale. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"4_CR3","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"4_CR4","unstructured":"Alec, R., et\u00a0al.: Language models are unsupervised multitask learners. In: OpenAI blog (2019)"},{"key":"4_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"4_CR6","unstructured":"Bao, H., et al.: Vlmo: unified vision-language pre-training with mixture-of-modality-experts. In: NeurIPS (2022)"},{"key":"4_CR7","unstructured":"Bavishi, R., et al.: Introducing our multimodal models (2023). https:\/\/www.adept.ai\/blog\/fuyu-8b"},{"key":"4_CR8","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"key":"4_CR9","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Caelles, S., Maninis, K.K., Pont-Tuset, J., Leal-Taix\u00e9, L., Cremers, D., Van\u00a0Gool, L.: One-shot video object segmentation. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.565"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: Coco-stuff: thing and stuff classes in context. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"4_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"4_CR14","unstructured":"Chen, K., et al.: MMDetection: open mmlab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155 (2019)"},{"key":"4_CR15","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"4_CR16","unstructured":"Chen, L.C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation. In: CVPR (2017)"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"4_CR18","unstructured":"Chen, M., et al.: Generative pretraining from pixels. In: ICML (2020)"},{"key":"4_CR19","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: a language modeling framework for object detection. In: ICLR (2022)"},{"key":"4_CR20","unstructured":"Chen, T., Saxena, S., Li, L., Lin, T.Y., Fleet, D.J., Hinton, G.E.: A unified sequence interface for vision tasks. In: NeurIPS (2022)"},{"key":"4_CR21","unstructured":"Chen, X., et al.: Microsoft coco captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., et al.: Uniter: universal image-text representation learning. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"4_CR24","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: ICML (2021)"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"4_CR26","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning. In: NeurIPS (2023)"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Ververas, E., Kotsia, I., Zafeiriou, S.: Retinaface: single-shot multi-level face localisation in the wild. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"4_CR28","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"4_CR29","unstructured":"Finn, C., Abbeel, P., Levine, S.: Model-agnostic meta-learning for fast adaptation of deep networks. In: ICML. PMLR (2017)"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast r-cnn. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"4_CR31","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"4_CR35","unstructured":"III\/4, I.W.: ISPRS 2D Semantic Labeling Contest. https:\/\/www.isprs.org\/education\/benchmarks\/UrbanSemLab\/2d-sem-label-potsdam.aspx"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: Mdetr-modulated detection for end-to-end multi-modal understanding. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"4_CR37","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (2019)"},{"key":"4_CR38","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization (2015)"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"4_CR40","doi-asserted-by":"publisher","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","DOI":"10.1007\/s11263-016-0981-7"},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Kuznetsova, A., et\u00a0al.: The open images dataset v4: unified image classification, object detection, and visual relationship detection at scale. In: IJCV (2020)","DOI":"10.1007\/s11263-020-01316-z"},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Li, H., et al.: Uni-perceiver v2: a generalist model for large-scale vision and vision-language tasks. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00264"},{"key":"4_CR43","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)"},{"key":"4_CR44","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"4_CR45","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: CVPR (2022)","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"4_CR46","doi-asserted-by":"publisher","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for\u00a0object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part IX, pp. 280\u2013296. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_17","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"4_CR47","doi-asserted-by":"publisher","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4_CR48","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"4_CR49","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"4_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, P., Qiu, S., Wang, X., Tang, X.: Deepfashion: powering robust clothes recognition and retrieval with rich annotations. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.124"},{"key":"4_CR51","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"4_CR52","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., Kembhavi, A.: UNIFIED-IO: a unified model for vision, language, and multi-modal tasks. In: ICLR (2023)"},{"key":"4_CR53","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"4_CR54","doi-asserted-by":"crossref","unstructured":"Ning, J., et al.: All in tokens: unifying output space of visual tasks via soft token. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01822"},{"key":"4_CR55","unstructured":"OpenAI. Chatgpt (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"4_CR56","unstructured":"OpenAI. Gpt-4 technical report (2023)"},{"key":"4_CR57","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2text: describing images using 1 million captioned photographs. In: NeurIPS, vol. 24 (2011)"},{"key":"4_CR58","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: NeurIPS, vol. 35 (2022)"},{"key":"4_CR59","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"4_CR60","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"4_CR61","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. In: JMLR (2020)"},{"key":"4_CR62","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML (2021)"},{"key":"4_CR63","unstructured":"Reed, S., et\u00a0al.: A generalist agent. In: TMLR (2022)"},{"key":"4_CR64","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. In: NeurIPS (2015)"},{"key":"4_CR65","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"4_CR66","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"4_CR67","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S.P., Xiao, J.: Sun RGB-D: a RGB-D scene understanding benchmark suite. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"4_CR68","doi-asserted-by":"crossref","unstructured":"Staal, J., Abr\u00e0moff, M.D., Niemeijer, M., Viergever, M.A., Van\u00a0Ginneken, B.: Ridge-based vessel segmentation in color images of the retina. In: TMI (2004)","DOI":"10.1109\/TMI.2004.825627"},{"key":"4_CR69","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"4_CR70","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"4_CR71","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"4_CR72","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. In: NeurIPS (2017)"},{"key":"4_CR73","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"4_CR74","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: DSVT: dynamic sparse voxel transformer with rotated sets. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01299"},{"key":"4_CR75","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Unitr: a unified and efficient multi-modal transformer for bird\u2019s-eye-view representation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00625"},{"key":"4_CR76","unstructured":"Wang, J., Zheng, Z., Ma, A., Lu, X., Zhong, Y.: Loveda: a remote sensing land-cover dataset for domain adaptive semantic segmentation. In: NeurIPS (2021)"},{"key":"4_CR77","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: ICML (2022)"},{"key":"4_CR78","unstructured":"Wang, W., et\u00a0al.: Visionllm: large language model is also an open-ended decoder for vision-centric tasks. In: NeurIPS (2023)"},{"key":"4_CR79","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: Beit pretraining for all vision and vision-language tasks. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"4_CR80","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chen, X., Cao, L., Huang, W., Sun, F., Wang, Y.: Multimodal token fusion for vision transformers. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01187"},{"key":"4_CR81","unstructured":"Wu, Y., et\u00a0al.: Google\u2019s neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016)"},{"key":"4_CR82","doi-asserted-by":"crossref","unstructured":"Xie, E., et al.: Polarmask: single shot instance segmentation with polar representation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01221"},{"key":"4_CR83","doi-asserted-by":"crossref","unstructured":"Xu, W., Wang, H., Qi, F., Lu, C.: Explicit shape encoding for real-time instance segmentation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00527"},{"key":"4_CR84","unstructured":"Yamazaki, K., et\u00a0al.: Aerialformer: multi-resolution transformer for aerial image segmentation. arXiv preprint arXiv:2306.06842 (2023)"},{"key":"4_CR85","doi-asserted-by":"crossref","unstructured":"Yang, S., Luo, P., Loy, C.C., Tang, X.: Wider face: a face detection benchmark. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.596"},{"key":"4_CR86","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Unitab: unifying text and box outputs for grounded vision-language modeling. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20059-5_30"},{"key":"4_CR87","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: ECCV. Springer (2016)","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"4_CR88","unstructured":"Zhang, H., et al.: Dino: Detr with improved denoising anchor boxes for end-to-end object detection. In: ICLR (2022)"},{"key":"4_CR89","unstructured":"Zhang, S., et\u00a0al.: Opt: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"4_CR90","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"4_CR91","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"4_CR92","unstructured":"Zhu, J., et al.: Uni-perceiver-MOE: learning sparse generalist models with conditional MOEs. In: NeurIPS (2022)"},{"key":"4_CR93","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: deformable transformers for end-to-end object detection. In: ICLR (2020)"},{"key":"4_CR94","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: Uni-perceiver: pre-training unified architecture for generic perception for zero-shot and few-shot tasks. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01630"},{"key":"4_CR95","doi-asserted-by":"crossref","unstructured":"Zou, X., et\u00a0al.: Generalized decoding for pixel, image, and language. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01451"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73397-0_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T18:21:47Z","timestamp":1732990907000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73397-0_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031733963","9783031733970"],"references-count":95,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73397-0_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}