{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,20]],"date-time":"2025-08-20T13:24:07Z","timestamp":1755696247668,"version":"3.40.3"},"publisher-location":"Cham","reference-count":92,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729195"},{"type":"electronic","value":"9783031729201"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72920-1_14","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T08:02:57Z","timestamp":1727683377000},"page":"239-258","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A Semantic Space is Worth 256 Language Descriptions: Make Stronger Segmentation Models with\u00a0Descriptive Properties"],"prefix":"10.1007","author":[{"given":"Junfei","family":"Xiao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziqi","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenxuan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiyi","family":"Lan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jieru","family":"Mei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiding","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bingchen","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alan","family":"Yuille","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuyin","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cihang","family":"Xie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Akata, Z., Perronnin, F., Harchaoui, Z., Schmid, C.: Label-embedding for attribute-based classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 819\u2013826 (2013)","DOI":"10.1109\/CVPR.2013.111"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Amit, Y., Fink, M., Srebro, N., Ullman, S.: Uncovering shared structures in multiclass classification. In: Proceedings of the 24th International Conference on Machine Learning, pp. 17\u201324 (2007)","DOI":"10.1145\/1273496.1273499"},{"key":"14_CR3","unstructured":"Bai, J., et al.: Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.129661(2), 3 (2023)"},{"key":"14_CR4","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of image transformers. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=p-BhZSz59o4"},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Bilen, H., Vedaldi, A.: Weakly supervised deep detection networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2846\u20132854 (2016)","DOI":"10.1109\/CVPR.2016.311"},{"key":"14_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR7","unstructured":"Bucher, M., Vu, T.H., Cord, M., P\u00e9rez, P.: Zero-shot semantic segmentation. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"14_CR8","unstructured":"Chen, J., Yang, Z., Zhang, L.: Semantic segment anything. https:\/\/github.com\/fudan-zvg\/Semantic-Segment-Anything (2023)"},{"key":"14_CR9","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"issue":"4","key":"14_CR10","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: DeepLab: semantic image segmentation with deep convolutional nets, Atrous convolution, and fully connected CRFs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Li, S., Lim, S.N., Torralba, A., Zhao, H.: Open-vocabulary panoptic segmentation with embedding modulation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00111"},{"key":"14_CR12","unstructured":"Chen, Z., et al.: Vision transformer adapter for dense predictions. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=plKu2GByCNW"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"issue":"9","key":"14_CR14","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1145\/2701413","volume":"58","author":"E Davis","year":"2015","unstructured":"Davis, E., Marcus, G.: Commonsense reasoning and commonsense knowledge in artificial intelligence. Commun. ACM 58(9), 92\u2013103 (2015)","journal-title":"Commun. ACM"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Dekel, O., Keshet, J., Singer, Y.: Large margin hierarchical classification. In: Proceedings of the Twenty-First International Conference on Machine Learning, p.\u00a027 (2004)","DOI":"10.1145\/1015330.1015374"},{"key":"14_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1007\/978-3-319-10590-1_4","volume-title":"Computer Vision \u2013 ECCV 2014","author":"J Deng","year":"2014","unstructured":"Deng, J., et al.: Large-scale object classification using label relation graphs. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part I. LNCS, vol. 8689, pp. 48\u201364. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_4"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"14_CR18","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Xia, G.S., Dai, D.: Decoupling zero-shot semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11583\u201311592 (2022)","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Durand, T., Mordan, T., Thome, N., Cord, M.: Wildcat: weakly supervised learning of deep convnets for image classification, pointwise localization and segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 642\u2013651 (2017)","DOI":"10.1109\/CVPR.2017.631"},{"key":"14_CR20","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. Int. J. Comput. Vision 88, 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vision"},{"key":"14_CR21","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D.: Attribute-centric recognition for cross-category generalization. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 2352\u20132359. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5539924"},{"key":"14_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"762","DOI":"10.1007\/978-3-642-15549-9_55","volume-title":"Computer Vision \u2013 ECCV 2010","author":"R Fergus","year":"2010","unstructured":"Fergus, R., Bernal, H., Weiss, Y., Torralba, A.: Semantic label sharing for learning with many categories. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part I. LNCS, vol. 6311, pp. 762\u2013775. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15549-9_55"},{"issue":"11","key":"14_CR23","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: The kitti dataset. Int. J. Robot. Res. 32(11), 1231\u20131237 (2013)","journal-title":"Int. J. Robot. Res."},{"key":"14_CR24","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Open-vocabulary image segmentation. In: ECCV (2022)"},{"key":"14_CR25","doi-asserted-by":"publisher","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, TY.: Scaling open-vocabulary image segmentation with image-level labels. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022, ECCV 2022, LNCS, vol. 13696, pp. 540\u2013557. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"14_CR26","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"14_CR27","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1016\/j.media.2016.05.004","volume":"35","author":"M Havaei","year":"2017","unstructured":"Havaei, M., et al.: Brain tumor segmentation with deep neural networks. Med. Image Anal. 35, 18\u201331 (2017)","journal-title":"Med. Image Anal."},{"key":"14_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1007\/978-3-319-46448-0_7","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Hu","year":"2016","unstructured":"Hu, R., Rohrbach, M., Darrell, T.: Segmentation from natural language expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part I. LNCS, vol. 9905, pp. 108\u2013124. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_7"},{"key":"14_CR29","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4555\u20134564 (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Huang, S., Lu, Z., Cheng, R., He, C.: FaPN: feature-aligned pyramid network for dense image prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 864\u2013873 (2021)","DOI":"10.1109\/ICCV48922.2021.00090"},{"key":"14_CR31","unstructured":"Jain, J., et al.: SeMask: semantically masked transformers for semantic segmentation. arXiv preprint arXiv:2112.12782 (2021)"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: Mdetr-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"14_CR33","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferItGame: referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"14_CR34","doi-asserted-by":"publisher","unstructured":"Kim, D., et al.: Learning semantic segmentation from multiple datasets with label shifts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022, ECCV 2022, LNCS, vol. 13688, pp. 20\u201336. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_2","DOI":"10.1007\/978-3-031-19815-1_2"},{"issue":"5140","key":"14_CR35","doi-asserted-by":"publisher","first-page":"1747","DOI":"10.1126\/science.8259522","volume":"262","author":"BJ Knowlton","year":"1993","unstructured":"Knowlton, B.J., Squire, L.R.: The learning of categories: parallel brain systems for item memory and category knowledge. Science 262(5140), 1747\u20131749 (1993)","journal-title":"Science"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: Lisa: reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"14_CR37","doi-asserted-by":"crossref","unstructured":"Lambert, J., Liu, Z., Sener, O., Hays, J., Koltun, V.: MSeg: a composite dataset for multi-domain semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2879\u20132888 (2020)","DOI":"10.1109\/CVPR42600.2020.00295"},{"key":"14_CR38","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: International Conference on Learning Representations (2022)"},{"key":"14_CR39","unstructured":"Li, F., et al.: Semantic-sam: segment and recognize anything at any granularity. arXiv preprint arXiv:2307.04767 (2023)"},{"key":"14_CR40","doi-asserted-by":"crossref","unstructured":"Li, L., Zhou, T., Wang, W., Li, J., Yang, Y.: Deep hierarchical semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1246\u20131257 (2022)","DOI":"10.1109\/CVPR52688.2022.00131"},{"key":"14_CR41","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"14_CR42","doi-asserted-by":"crossref","unstructured":"Liang, X., Zhou, H., Xing, E.: Dynamic-structured semantic propagation network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 752\u2013761 (2018)","DOI":"10.1109\/CVPR.2018.00085"},{"key":"14_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: Common Objects in Context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"14_CR44","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"14_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin transformer v2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"14_CR46","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"2","key":"14_CR47","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1109\/TIT.1982.1056489","volume":"28","author":"S Lloyd","year":"1982","unstructured":"Lloyd, S.: Least squares quantization in PCM. IEEE Trans. Inf. Theory 28(2), 129\u2013137 (1982)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"14_CR48","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"14_CR49","doi-asserted-by":"crossref","unstructured":"Marszalek, M., Schmid, C.: Semantic hierarchies for visual object recognition. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition. pp.\u00a01\u20137. IEEE (2007)","DOI":"10.1109\/CVPR.2007.383272"},{"key":"14_CR50","unstructured":"Menon, S., Vondrick, C.: Visual classification via description from large language models. In: International Conference on Learning Representations (2023)"},{"key":"14_CR51","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer vision and Pattern Recognition, pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"14_CR52","doi-asserted-by":"crossref","unstructured":"Mukhoti, J., et al.: Open vocabulary semantic segmentation with patch aligned contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19413\u201319423 (2023)","DOI":"10.1109\/CVPR52729.2023.01860"},{"key":"14_CR53","unstructured":"Palatucci, M., Pomerleau, D., Hinton, G.E., Mitchell, T.M.: Zero-shot learning with semantic output codes. In: Advances in Neural Information Processing Systems, vol. 22 (2009)"},{"key":"14_CR54","unstructured":"Peng, Z., Dong, L., Bao, H., Ye, Q., Wei, F.: Beit v2: masked image modeling with vector-quantized visual tokenizers. arXiv preprint arXiv:2208.06366 (2022)"},{"key":"14_CR55","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"14_CR56","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"14_CR57","first-page":"10353","volume":"35","author":"Y Rao","year":"2022","unstructured":"Rao, Y., Zhao, W., Tang, Y., Zhou, J., Lim, S.N., Lu, J.: Hornet: efficient high-order spatial interactions with recursive gated convolutions. Adv. Neural. Inf. Process. Syst. 35, 10353\u201310366 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR58","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-bert: sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)","DOI":"10.18653\/v1\/D19-1410"},{"key":"14_CR59","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015, Part III. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"14_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"556","DOI":"10.1007\/978-3-319-24553-9_68","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"HR Roth","year":"2015","unstructured":"Roth, H.R., et al.: DeepOrgan: multi-level deep convolutional networks for automated pancreas segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9349, pp. 556\u2013564. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24553-9_68"},{"key":"14_CR61","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-642-35749-7_1","volume-title":"Trends and Topics in Computer Vision","author":"O Russakovsky","year":"2012","unstructured":"Russakovsky, O., Fei-Fei, L.: Attribute learning in large-scale datasets. In: Kutulakos, K.N. (ed.) ECCV 2010, Part I. LNCS, vol. 6553, pp. 1\u201314. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-35749-7_1"},{"key":"14_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"242","DOI":"10.1007\/978-3-642-33715-4_18","volume-title":"Computer Vision \u2013 ECCV 2012","author":"V Sharmanska","year":"2012","unstructured":"Sharmanska, V., Quadrianto, N., Lampert, C.H.: Augmented attribute representations. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012, Part V. LNCS, vol. 7576, pp. 242\u2013255. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33715-4_18"},{"issue":"1","key":"14_CR63","doi-asserted-by":"publisher","first-page":"333","DOI":"10.1016\/j.patcog.2011.05.017","volume":"45","author":"AM Tousch","year":"2012","unstructured":"Tousch, A.M., Herbin, S., Audibert, J.Y.: Semantic hierarchies for image annotation: a survey. Pattern Recogn. 45(1), 333\u2013345 (2012)","journal-title":"Pattern Recogn."},{"key":"14_CR64","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"14_CR65","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"14_CR66","unstructured":"Wang, W., et\u00a0al.: Visionllm: large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)"},{"key":"14_CR67","doi-asserted-by":"crossref","unstructured":"Wang, X., Ye, Y., Gupta, A.: Zero-shot recognition via semantic embeddings and knowledge graphs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6857\u20136866 (2018)","DOI":"10.1109\/CVPR.2018.00717"},{"key":"14_CR68","unstructured":"Wolf, T., et\u00a0al.: Huggingface\u2019s transformers: state-of-the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)"},{"key":"14_CR69","doi-asserted-by":"crossref","unstructured":"Wu, C., Lin, Z., Cohen, S., Bui, T., Maji, S.: PhraseCut: language-based image segmentation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10216\u201310225 (2020)","DOI":"10.1109\/CVPR42600.2020.01023"},{"key":"14_CR70","doi-asserted-by":"crossref","unstructured":"Xian, Y., Choudhury, S., He, Y., Schiele, B., Akata, Z.: Semantic projection network for zero-and few-label semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8256\u20138265 (2019)","DOI":"10.1109\/CVPR.2019.00845"},{"key":"14_CR71","unstructured":"Xiao, J., Xu, Z., Yuille, A., Yan, S., Wang, B.: PaLM2-VAdapter: progressively aligned language model makes a strong vision-language adapter. arXiv preprint arXiv:2402.10896 (2024)"},{"key":"14_CR72","unstructured":"Xiao, J., Xu, Z., Lan, S., Yu, Z., Yuille, A., Anandkumar, A.: 1st place solution of the robust vision challenge 2022 semantic segmentation track. arXiv preprint arXiv:2210.12852 (2022)"},{"key":"14_CR73","doi-asserted-by":"crossref","unstructured":"Xiao, S., Liu, Z., Zhang, P., Muennighoff, N.: C-pack: packaged resources to advance general Chinese embedding (2023)","DOI":"10.1145\/3626772.3657878"},{"key":"14_CR74","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 418\u2013434 (2018)","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"14_CR75","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR76","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Groupvit: semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"14_CR77","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De\u00a0Mello, S.: Open-vocabulary panoptic segmentation with text-to-image diffusion models. arXiv preprint arXiv:2303.04803 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"14_CR78","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model. In: ECCV, pp. 736\u2013753 (2022)","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"14_CR79","doi-asserted-by":"crossref","unstructured":"Yang, Z., Wang, J., Tang, Y., Chen, K., Zhao, H., Torr, P.H.: LAVT: language-aware vision transformer for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18155\u201318165 (2022)","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"14_CR80","doi-asserted-by":"crossref","unstructured":"Ye, L., Rochan, M., Liu, Z., Wang, Y.: Cross-modal self-attention network for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10502\u201310511 (2019)","DOI":"10.1109\/CVPR.2019.01075"},{"key":"14_CR81","doi-asserted-by":"crossref","unstructured":"Yu, F.X., Cao, L., Feris, R.S., Smith, J.R., Chang, S.F.: Designing category-level attributes for discriminative visual recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 771\u2013778 (2013)","DOI":"10.1109\/CVPR.2013.105"},{"key":"14_CR82","doi-asserted-by":"crossref","unstructured":"Yu, F., et al.: Bdd100k: a diverse driving dataset for heterogeneous multitask learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2636\u20132645 (2020)","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"14_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: A simple framework for open-vocabulary segmentation and detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1020\u20131031 (2023)","DOI":"10.1109\/ICCV51070.2023.00100"},{"key":"14_CR84","unstructured":"Zhang, P., Xiao, S., Liu, Z., Dou, Z., Nie, J.Y.: Retrieve anything to augment large language models. arXiv preprint arXiv:2310.07554 (2023)"},{"key":"14_CR85","unstructured":"Zheng\u00a0Ding, Jieke\u00a0Wang, Z.T.: Open-vocabulary universal image segmentation with maskclip. In: International Conference on Machine Learning (2023)"},{"key":"14_CR86","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"14_CR87","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"14_CR88","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"14_CR89","unstructured":"Zhou, Q., Liu, Y., Yu, C., Li, J., Wang, Z., Wang, F.: LMSeg: language-guided multi-dataset segmentation. In: International Conference on Learning Representations (2023)"},{"key":"14_CR90","doi-asserted-by":"crossref","unstructured":"Zhu, C., Chen, F., Ahmed, U., Shen, Z., Savvides, M.: Semantic relation reasoning for shot-stable few-shot object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8782\u20138791 (2021)","DOI":"10.1109\/CVPR46437.2021.00867"},{"key":"14_CR91","doi-asserted-by":"crossref","unstructured":"Zou, X., et\u00a0al.: Generalized decoding for pixel, image, and language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023)","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"14_CR92","doi-asserted-by":"crossref","unstructured":"Zweig, A., Weinshall, D.: Exploiting object hierarchy: combining models from different category levels. In: 2007 IEEE 11th International Conference on Computer Vision, pp.\u00a01\u20138. IEEE (2007)","DOI":"10.1109\/ICCV.2007.4409064"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72920-1_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:53:30Z","timestamp":1732830810000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72920-1_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729195","9783031729201"],"references-count":92,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72920-1_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}