{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:28:54Z","timestamp":1778081334961,"version":"3.51.4"},"publisher-location":"Cham","reference-count":90,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732256","type":"print"},{"value":"9783031732263","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73226-3_27","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T15:02:57Z","timestamp":1730386977000},"page":"475-494","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["PartGLEE: A Foundation Model for\u00a0Recognizing and\u00a0Parsing Any Objects"],"prefix":"10.1007","author":[{"given":"Junyi","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junfeng","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weizhi","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"Bai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"27_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 23716\u201323736 (2022)"},{"key":"27_CR2","unstructured":"Brohan, A., et\u00a0al.: RT-1: robotics transformer for real-world control at scale. arXiv preprint arXiv:2212.06817 (2022)"},{"key":"27_CR3","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901 (2020)"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"27_CR5","first-page":"31333","volume":"35","author":"T Chen","year":"2022","unstructured":"Chen, T., Saxena, S., Li, L., Lin, T.Y., Fleet, D.J., Hinton, G.E.: A unified sequence interface for vision tasks. Adv. Neural. Inf. Process. Syst. 35, 31333\u201331346 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Mottaghi, R., Liu, X., Fidler, S., Urtasun, R., Yuille, A.: Detect what you can: detecting and representing objects using holistic models and body parts. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1971\u20131978 (2014)","DOI":"10.1109\/CVPR.2014.254"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"27_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"27_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Dong, J., Chen, Q., Shen, X., Yang, J., Yan, S.: Towards unified human parsing and pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 843\u2013850 (2014)","DOI":"10.1109\/CVPR.2014.113"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Fang, Y., Sun, Q., Wang, X., Huang, T., Wang, X., Cao, Y.: EVA-02: a visual representation for neon genesis. arXiv preprint arXiv:2303.11331 (2023)","DOI":"10.2139\/ssrn.4813567"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: EVA: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"de\u00a0Geus, D., Meletis, P., Lu, C., Wen, X., Dubbelman, G.: Part-aware panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5485\u20135494 (2021)","DOI":"10.1109\/CVPR46437.2021.00544"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Gong, K., Liang, X., Zhang, D., Shen, X., Lin, L.: Look into person: self-supervised structure-sensitive learning and a new benchmark for human parsing. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 932\u2013940 (2017)","DOI":"10.1109\/CVPR.2017.715"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"He, J., Chen, J., Lin, M.X., Yu, Q., Yuille, A.L.: Compositor: bottom-up clustering and compositing for robust part and object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11259\u201311268 (2023)","DOI":"10.1109\/CVPR52729.2023.01083"},{"key":"27_CR18","doi-asserted-by":"publisher","unstructured":"He, J., et al.: PartImageNet: a large, high-quality dataset of parts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13668, pp. 128\u2013145. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_8","DOI":"10.1007\/978-3-031-20074-8_8"},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"27_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"27_CR21","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"27_CR23","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"27_CR24","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"issue":"7","key":"27_CR25","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., et al.: The open images dataset V4: unified image classification, object detection, and visual relationship detection at scale. Int. J. Comput. Vision 128(7), 1956\u20131981 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"27_CR26","unstructured":"Li, F., et al.: Semantic-SAM: segment and recognize anything at any granularity. arXiv preprint arXiv:2307.04767 (2023)"},{"key":"27_CR27","doi-asserted-by":"crossref","unstructured":"Li, F., et al.: Mask DINO: towards a unified transformer-based framework for object detection and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3041\u20133050 (2023)","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"27_CR28","doi-asserted-by":"crossref","unstructured":"Li, H., et\u00a0al.: Uni-perceiver V2: a generalist model for large-scale vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2691\u20132700 (2023)","DOI":"10.1109\/CVPR52729.2023.00264"},{"key":"27_CR29","unstructured":"Li, J., et al.: Multiple-human parsing in the wild. arXiv preprint arXiv:1705.07206 (2017)"},{"key":"27_CR30","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"27_CR31","doi-asserted-by":"publisher","unstructured":"Li, X., Xu, S., Yang, Y., Cheng, G., Tong, Y., Tao, D.: Panoptic-partformer: learning a unified model for panoptic part segmentation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13687, pp. 729\u2013747. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_42","DOI":"10.1007\/978-3-031-19812-0_42"},{"key":"27_CR32","doi-asserted-by":"publisher","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 280\u2013296. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_17","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"27_CR33","unstructured":"Li, Y., Singh, K.K., Xue, Y., Lee, Y.J.: PartGAN: weakly-supervised part decomposition for image generation and segmentation. In: British Machine Vision Conference (BMVC) (2021)"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Lin, C., Jiang, Y., Qu, L., Yuan, Z., Cai, J.: Generative region-language pretraining for open-ended object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13958\u201313968 (2024)","DOI":"10.1109\/CVPR52733.2024.01324"},{"key":"27_CR35","unstructured":"Lin, C., et al.: Learning object-language alignments for open-vocabulary object detection. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"27_CR36","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"27_CR37","doi-asserted-by":"publisher","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"27_CR38","unstructured":"Ling, H., Kreis, K., Li, D., Kim, S.W., Torralba, A., Fidler, S.: EditGAN: high-precision semantic image editing. In: Advances in Neural Information Processing Systems, vol.\u00a034, pp. 16331\u201316345 (2021)"},{"key":"27_CR39","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference On Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"27_CR41","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., Kembhavi, A.: Unified-IO: a unified model for vision, language, and multi-modal tasks. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"27_CR42","unstructured":"Ma, C., Jiang, Y., Wen, X., Yuan, Z., Qi, X.: CoDet: co-occurrence guided region-word alignment for open-vocabulary object detection. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2023)"},{"key":"27_CR43","doi-asserted-by":"crossref","unstructured":"Ma, C., Jiang, Y., Wu, J., Yuan, Z., Qi, X.: Groma: localized visual tokenization for grounding multimodal large language models. arXiv preprint arXiv:2404.13013 (2024)","DOI":"10.1007\/978-3-031-72658-3_24"},{"key":"27_CR44","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"27_CR45","unstructured":"Meletis, P., Wen, X., Lu, C., de\u00a0Geus, D., Dubbelman, G.: Cityscapes-panoptic-parts and pascal-panoptic-parts datasets for scene understanding. arXiv preprint arXiv:2004.07944 (2020)"},{"key":"27_CR46","doi-asserted-by":"publisher","unstructured":"Michieli, U., Borsato, E., Rossi, L., Zanuttigh, P.: GMNet: graph matching network for\u00a0large scale part semantic segmentation in the wild. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12353, pp. 397\u2013414. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58598-3_24","DOI":"10.1007\/978-3-030-58598-3_24"},{"key":"27_CR47","doi-asserted-by":"crossref","unstructured":"Milletari, F., Navab, N., Ahmadi, S.A.: V-Net: fully convolutional neural networks for volumetric medical image segmentation. In: 2016 Fourth International Conference on 3D Vision (3DV) (2016)","DOI":"10.1109\/3DV.2016.79"},{"key":"27_CR48","unstructured":"Morabia, K., Arora, J., Vijaykumar, T.: Attention-based joint detection of object and semantic part. arXiv preprint arXiv:2007.02419 (2020)"},{"key":"27_CR49","unstructured":"Nair, S., Rajeswaran, A., Kumar, V., Finn, C., Gupta, A.: R3M: a universal visual representation for robot manipulation. In: Conference on Robot Learning, pp. 892\u2013909. PMLR (2023)"},{"key":"27_CR50","doi-asserted-by":"crossref","unstructured":"Ng, X.L., Ong, K.E., Zheng, Q., Ni, Y., Yeo, S.Y., Liu, J.: Animal kingdom: a large and diverse dataset for animal behavior understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19023\u201319034 (2022)","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"27_CR51","doi-asserted-by":"crossref","unstructured":"Pan, T.Y., Liu, Q., Chao, W.L., Price, B.: Towards open-world segmentation of parts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15392\u201315401 (2023)","DOI":"10.1109\/CVPR52729.2023.01477"},{"key":"27_CR52","unstructured":"Qi, L., et al.: AIMS: all-inclusive multi-level segmentation for anything. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2023)"},{"key":"27_CR53","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"1","key":"27_CR54","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"27_CR55","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., et\u00a0al.: PACO: parts and attributes of common objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7141\u20137151 (2023)","DOI":"10.1109\/CVPR52729.2023.00690"},{"key":"27_CR56","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"27_CR57","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"27_CR58","doi-asserted-by":"crossref","unstructured":"Reddy, N.D., Vo, M., Narasimhan, S.G.: CarFusion: combining point tracking and part detection for dynamic 3D reconstruction of vehicles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1906\u20131915 (2018)","DOI":"10.1109\/CVPR.2018.00204"},{"key":"27_CR59","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"27_CR60","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"27_CR61","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"27_CR62","doi-asserted-by":"crossref","unstructured":"Sun, P., Chen, S., Zhu, C., Xiao, F., Luo, P., Xie, S., Yan, Z.: Going denser with open-vocabulary part segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15453\u201315465 (2023)","DOI":"10.1109\/ICCV51070.2023.01417"},{"key":"27_CR63","doi-asserted-by":"crossref","unstructured":"Tang, C., Xie, L., Zhang, X., Hu, X., Tian, Q.: Visual recognition by request. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15265\u201315274 (2023)","DOI":"10.1109\/CVPR52729.2023.01465"},{"key":"27_CR64","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: Caltech-UCSD birds-200-2011 (cub-200-2011). Technical report. CNS-TR-2011-001, California Institute of Technology (2011)"},{"key":"27_CR65","doi-asserted-by":"crossref","unstructured":"Wang, P., Shen, X., Lin, Z., Cohen, S., Price, B., Yuille, A.L.: Joint object and part segmentation using deep learned potentials. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1573\u20131581 (2015)","DOI":"10.1109\/ICCV.2015.184"},{"key":"27_CR66","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"27_CR67","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"27_CR68","unstructured":"Wang, X., Li, S., Kallidromitis, K., Kato, Y., Kozuka, K., Darrell, T.: Hierarchical open-vocabulary universal image segmentation. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2023)"},{"key":"27_CR69","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, Y., Tsogkas, S., Bai, X., Dickinson, S., Siddiqi, K.: DeepFlux for skeletons in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5287\u20135296 (2019)","DOI":"10.1109\/CVPR.2019.00543"},{"key":"27_CR70","unstructured":"Wei, M., Yue, X., Zhang, W., Kong, S., Liu, X., Pang, J.: OV-parts: towards open-vocabulary part segmentation. In: Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2023)"},{"key":"27_CR71","doi-asserted-by":"crossref","unstructured":"Wu, J., Jiang, Y., Liu, Q., Yuan, Z., Bai, X., Bai, S.: General object foundation model for images and videos at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3783\u20133795 (2024)","DOI":"10.1109\/CVPR52733.2024.00363"},{"key":"27_CR72","doi-asserted-by":"crossref","unstructured":"Xiao, B., et al.: Florence-2: advancing a unified representation for a variety of vision tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4818\u20134829 (2024)","DOI":"10.1109\/CVPR52733.2024.00461"},{"key":"27_CR73","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"733","DOI":"10.1007\/978-3-031-19803-8_43","volume-title":"ECCV 2022","author":"B Yan","year":"2022","unstructured":"Yan, B., et al.: Towards grand unification of object tracking. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13681, pp. 733\u2013751. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19803-8_43"},{"key":"27_CR74","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15325\u201315336 (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"27_CR75","doi-asserted-by":"crossref","unstructured":"Yang, L., Song, Q., Wang, Z., Jiang, M.: Parsing R-CNN for instance-level human analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 364\u2013373 (2019)","DOI":"10.1109\/CVPR.2019.00045"},{"key":"27_CR76","doi-asserted-by":"crossref","unstructured":"Yang, Y., Ramanan, D.: Articulated pose estimation with flexible mixtures-of-parts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1385\u20131392. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995741"},{"key":"27_CR77","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"521","DOI":"10.1007\/978-3-031-20059-5_30","volume-title":"ECCV 2022","author":"Z Yang","year":"2022","unstructured":"Yang, Z., et al.: UniTAB: unifying text and box outputs for grounded vision-language modeling. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13696, pp. 521\u2013539. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_30"},{"key":"27_CR78","doi-asserted-by":"crossref","unstructured":"Yao, L., et al.: DetCLIPv2: scalable open-vocabulary object detection pre-training via word-region alignment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23497\u201323506 (2023)","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"27_CR79","unstructured":"Yao, L., et al.: DetCLIP: dictionary-enriched visual-concept paralleled pre-training for open-world detection. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 9125\u20139138 (2022)"},{"key":"27_CR80","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"27_CR81","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"27_CR82","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"27_CR83","doi-asserted-by":"crossref","unstructured":"Zhong, Y., et\u00a0al.: RegionCLIP: region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"27_CR84","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2018","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ADE20K dataset. Int. J. Comput. Vis. 127, 302\u2013321 (2018)","journal-title":"Int. J. Comput. Vis."},{"key":"27_CR85","doi-asserted-by":"crossref","unstructured":"Zhou, T., Wang, W., Liu, S., Yang, Y., Van\u00a0Gool, L.: Differentiable multi-granularity human representation learning for instance-aware human semantic parsing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1622\u20131631 (2021)","DOI":"10.1109\/CVPR46437.2021.00167"},{"key":"27_CR86","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (2021)"},{"key":"27_CR87","doi-asserted-by":"crossref","unstructured":"Zhu, X., et al.: Uni-perceiver: pre-training unified architecture for generic perception for zero-shot and few-shot tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16804\u201316815 (2022)","DOI":"10.1109\/CVPR52688.2022.01630"},{"key":"27_CR88","doi-asserted-by":"crossref","unstructured":"Ziegler, A., Asano, Y.M.: Self-supervised learning of object parts for semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14502\u201314511 (2022)","DOI":"10.1109\/CVPR52688.2022.01410"},{"key":"27_CR89","doi-asserted-by":"crossref","unstructured":"Zou, X., et al.: Generalized decoding for pixel, image, and language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023)","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"27_CR90","unstructured":"Zou, X., et al.: Segment everything everywhere all at once. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73226-3_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T16:02:18Z","timestamp":1732982538000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73226-3_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031732256","9783031732263"],"references-count":90,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73226-3_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}