{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,28]],"date-time":"2026-07-28T14:44:41Z","timestamp":1785249881430,"version":"3.55.0"},"publisher-location":"Cham","reference-count":122,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729393","type":"print"},{"value":"9783031729409","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72940-9_9","type":"book-chapter","created":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T20:43:17Z","timestamp":1731789797000},"page":"143-164","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["In Defense of\u00a0Lazy Visual Grounding for\u00a0Open-Vocabulary Semantic Segmentation"],"prefix":"10.1007","author":[{"given":"Dahyun","family":"Kang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minsu","family":"Cho","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,17]]},"reference":[{"key":"9_CR1","unstructured":"Amir, S., Gandelsman, Y., Bagon, S., Dekel, T.: Deep VIT features as dense visual descriptors. arXiv preprint arXiv:2112.05814 2(3), 4 (2021)"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Arbelaez, P., Maire, M., Fowlkes, C., Malik, J.: From contours to regions: an empirical evaluation. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 2294\u20132301. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206707"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Blake, A., Kohli, P., Rother, C.: Markov Random Fields for Vision and Image Processing. MIT Press (2011)","DOI":"10.7551\/mitpress\/8579.001.0001"},{"issue":"11","key":"9_CR4","doi-asserted-by":"publisher","first-page":"1222","DOI":"10.1109\/34.969114","volume":"23","author":"Y Boykov","year":"2001","unstructured":"Boykov, Y., Veksler, O., Zabih, R.: Fast approximate energy minimization via graph cuts. IEEE Trans. Pattern Anal. Mach. Intell. 23(11), 1222\u20131239 (2001)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR5","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. Adv. Neural Inf. Process. Syst. (NeurIPS) (2020)"},{"key":"9_CR6","unstructured":"Bucher, M., Vu, T.H., Cord, M., P\u00e9rez, P.: Zero-shot semantic segmentation. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Cai, K., et al.: Mixreorg: cross-modal mixed patch reorganization is a good mask learner for open-world semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1196\u20131205 (2023)","DOI":"10.1109\/ICCV51070.2023.00116"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Cai, Z., Vasconcelos, N.: Cascade r-cnn: delving into high quality object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6154\u20136162 (2018)","DOI":"10.1109\/CVPR.2018.00644"},{"key":"9_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01264-9_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Caron","year":"2018","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 139\u2013156. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9"},{"key":"9_CR10","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. Adv. Neural Inf. Process. Syst. (2020)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Cha, J., Mun, J., Roh, B.: Learning to generate text-grounded mask for open-world semantic segmentation from only image-text pairs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01074"},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"issue":"12","key":"9_CR14","doi-asserted-by":"publisher","first-page":"1673","DOI":"10.1109\/83.730379","volume":"7","author":"CW Chen","year":"1998","unstructured":"Chen, C.W., Luo, J., Parker, K.J.: Image segmentation via adaptive k-mean clustering and knowledge-based morphological operations with biomedical applications. IEEE Trans. Image Process. 7(12), 1673\u20131683 (1998)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: Exploring open-vocabulary semantic segmentation from clip vision encoder distillation only. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00071"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"9_CR17","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"9_CR19","unstructured":"Cheng, B., Schwing, A.G., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation (2021)"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Cho, M., Kwak, S., Schmid, C., Ponce, J.: Unsupervised object discovery and localization in the wild: part-based matching with bottom-up region proposals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1201\u20131210 (2015)","DOI":"10.1109\/CVPR.2015.7298724"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Cho, S., et al.: Cat-seg: cost aggregation for open-vocabulary semantic segmentation (2023)","DOI":"10.1109\/CVPR52733.2024.00394"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Choi, S., Kang, D., Cho, M.: Contrastive mean-shift learning for generalized category discovery. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.02179"},{"key":"9_CR23","unstructured":"Contributors, M.: MMSegmentation: openmmlab semantic segmentation toolbox and benchmark. https:\/\/github.com\/open-mmlab\/mmsegmentation (2020)"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"9_CR25","unstructured":"Ding, Z., Wang, J., Tu, Z.: Open-vocabulary universal image segmentation with maskclip. In: Proceedings of the International Conference on Machine Learning (ICML) (2023)"},{"key":"9_CR26","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: Proceedings of the International Conference on Learning Representations (ICLR) (2021)"},{"issue":"2","key":"9_CR27","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The Pascal visual object classes (voc) challenge. Int. J. Comput. Vision 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vision"},{"key":"9_CR28","doi-asserted-by":"publisher","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.-Y.: Scaling open-vocabulary image segmentation with\u00a0image-level labels. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXVI, pp. 540\u2013557. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"9_CR29","doi-asserted-by":"publisher","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.-Y.: Scaling open-vocabulary image segmentation with\u00a0image-level labels. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXVI, pp. 540\u2013557. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Grauman, K., Darrell, T.: Unsupervised learning of categories from sets of partially matching image features. In: 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2006), vol.\u00a01, pp. 19\u201325. IEEE (2006)","DOI":"10.1109\/CVPR.2006.322"},{"key":"9_CR31","unstructured":"Grill, J.B., et\u00a0al.: Bootstrap your own latent-a new approach to self-supervised learning. Adv. Neural Inf. Process. Syst. (2020)"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Gu, Z., Zhou, S., Niu, L., Zhao, Z., Zhang, L.: Context-aware feature generation for zero-shot semantic segmentation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1921\u20131929 (2020)","DOI":"10.1145\/3394171.3413593"},{"key":"9_CR33","unstructured":"Gui, J., Chen, T., Cao, Q., Sun, Z., Luo, H., Tao, D.: A survey of self-supervised learning from multiple perspectives: algorithms, theory, applications and future trends. arXiv preprint arXiv:2301.05712 (2023)"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"9_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"9_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1007\/978-3-030-01228-1_30","volume-title":"Computer Vision \u2013 ECCV 2018","author":"K-J Hsu","year":"2018","unstructured":"Hsu, K.-J., Tsai, C.-C., Lin, Y.-Y., Qian, X., Chuang, Y.-Y.: Unsupervised CNN-based co-saliency detection with graphical optimization. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 502\u2013518. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_30"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: VOP: text-video co-operative prompt tuning for cross-modal retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6565\u20136574 (2023)","DOI":"10.1109\/CVPR52729.2023.00635"},{"key":"9_CR38","unstructured":"Iscen, A., Caron, M., Fathi, A., Schmid, C.: Retrieval-enhanced contrastive vision-text models. Adv. Neural Inf. Process. Syst. (2023)"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Jing, L., Tian, Y.: Self-supervised visual feature learning with deep neural networks: a survey. In: IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) (2020)","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Kang, D., Cho, M.: Integrative few-shot learning for classification and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9979\u20139990 (2022)","DOI":"10.1109\/CVPR52688.2022.00974"},{"key":"9_CR41","doi-asserted-by":"crossref","unstructured":"Kang, D., Koniusz, P., Cho, M., Murray, N.: Distilling self-supervised vision transformers for weakly-supervised few-shot classification and segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01880"},{"key":"9_CR42","doi-asserted-by":"crossref","unstructured":"Karazija, L., Laina, I., Vedaldi, A., Rupprecht, C.: Diffusion models for zero-shot open-vocabulary segmentation. arXiv preprint arXiv:2306.09316 (2023)","DOI":"10.1007\/978-3-031-72652-1_18"},{"issue":"4","key":"9_CR43","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1007\/BF00133570","volume":"1","author":"M Kass","year":"1988","unstructured":"Kass, M., Witkin, A., Terzopoulos, D.: Snakes: active contour models. Int. J. Comput. Vision 1(4), 321\u2013331 (1988)","journal-title":"Int. J. Comput. Vision"},{"key":"9_CR44","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT (2019)"},{"key":"9_CR45","unstructured":"Kim, S., Kang, M., Park, J.: Risclip: referring image segmentation framework using clip. arXiv preprint arXiv:2306.08498 (2023)"},{"key":"9_CR46","doi-asserted-by":"crossref","unstructured":"Kirillov, A., He, K., Girshick, R., Rother, C., Doll\u00e1r, P.: Panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9404\u20139413 (2019)","DOI":"10.1109\/CVPR.2019.00963"},{"key":"9_CR47","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Levinkov, E., Andres, B., Savchynskyy, B., Rother, C.: Instancecut: from edges to instances with multicut. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5008\u20135017 (2017)","DOI":"10.1109\/CVPR.2017.774"},{"key":"9_CR48","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"9_CR49","unstructured":"Kr\u00e4henb\u00fchl, P., Koltun, V.: Efficient inference in fully connected CRFs with Gaussian edge potentials. Adv. Neural Inf. Process. Syst. 24 (2011)"},{"key":"9_CR50","doi-asserted-by":"crossref","unstructured":"Lee, Y.J., Grauman, K.: Shape discovery from unlabeled image collections. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 2254\u20132261. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206698"},{"key":"9_CR51","doi-asserted-by":"crossref","unstructured":"Li, B., Shi, Y., Qi, Z., Chen, Z.: A survey on semantic segmentation. In: 2018 IEEE International Conference on Data Mining Workshops (ICDMW), pp. 1233\u20131240. IEEE (2018)","DOI":"10.1109\/ICDMW.2018.00176"},{"key":"9_CR52","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=RriDjddCLN"},{"key":"9_CR53","doi-asserted-by":"crossref","unstructured":"Li, Z., Chen, J.: Superpixel segmentation using linear spectral clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1356\u20131363 (2015)","DOI":"10.1109\/CVPR.2015.7298741"},{"key":"9_CR54","doi-asserted-by":"crossref","unstructured":"Li, Z., Zhou, Q., Zhang, X., Zhang, Y., Wang, Y., Xie, W.: Open-vocabulary object segmentation with diffusion models. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00705"},{"key":"9_CR55","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"9_CR56","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: Proceedings of the European Conference on Computer Vision (ECCV) (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"9_CR57","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"9_CR58","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: Learning customized visual models with retrieval-augmented knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15148\u201315158 (2023)","DOI":"10.1109\/CVPR52729.2023.01454"},{"key":"9_CR59","doi-asserted-by":"crossref","unstructured":"Liu, Q., Wen, Y., Han, J., Xu, C., Xu, H., Liang, X.: Open-world semantic segmentation via contrasting and clustering vision-language embedding. In: Proceedings of the European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-20044-1_16"},{"key":"9_CR60","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"9_CR61","unstructured":"Luo, H., Bao, J., Wu, Y., He, X., Li, T.: Segclip: patch aggregation with learnable centers for open-vocabulary semantic segmentation. In: Proceedings of the International Conference on Machine Learning (ICML) (2023)"},{"key":"9_CR62","doi-asserted-by":"crossref","unstructured":"Ma, H., et al.: Ei-clip: entity-aware interventional contrastive learning for e-commerce cross-modal retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18051\u201318061 (2022)","DOI":"10.1109\/CVPR52688.2022.01752"},{"key":"9_CR63","unstructured":"Ma, T., et al.: A simple long-tailed recognition baseline via vision-language model. arXiv preprint arXiv:2111.14745 (2021)"},{"key":"9_CR64","unstructured":"Ma, T., et al.: Unleashing the potential of vision-language models for long-tailed visual recognition (2022)"},{"key":"9_CR65","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"9_CR66","doi-asserted-by":"crossref","unstructured":"Noh, H., Hong, S., Han, B.: Learning deconvolution network for semantic segmentation. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.178"},{"key":"9_CR67","unstructured":"Paszke, A., et al.: Automatic differentiation in pytorch. In: Advances in Neural Information Processing Systems (NeurIPS) Workshop Autodiff (2017)"},{"key":"9_CR68","doi-asserted-by":"crossref","unstructured":"Pizer, S.M., et al.: Adaptive histogram equalization and its variations. Comput. Vision Graph. Image Process. 39(3), 355\u2013368 (1987)","DOI":"10.1016\/S0734-189X(87)80186-X"},{"key":"9_CR69","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning (ICML) (2021)"},{"key":"9_CR70","doi-asserted-by":"crossref","unstructured":"Rambhatla, S.S., Misra, I., Chellappa, R., Shrivastava, A.: Most: multiple object localization with self-supervised transformers for object discovery. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01450"},{"key":"9_CR71","doi-asserted-by":"crossref","unstructured":"Ranasinghe, K., McKinzie, B., Ravi, S., Yang, Y., Toshev, A., Shlens, J.: Perceptual grouping in contrastive vision-language models. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00513"},{"key":"9_CR72","unstructured":"Ren, P., et al.: Viewco: discovering text-supervised segmentation masks via multi-view semantic consistency. arXiv preprint arXiv:2302.10307 (2023)"},{"key":"9_CR73","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"9_CR74","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: International Conference on Medical Image Computing and Computer-Assisted Intervention (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"9_CR75","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vision 115(3), 211\u2013252 (2015)","DOI":"10.1007\/s11263-015-0816-y"},{"key":"9_CR76","doi-asserted-by":"crossref","unstructured":"Russell, B.C., Freeman, W.T., Efros, A.A., Sivic, J., Zisserman, A.: Using multiple segmentations to discover objects and their extent in image collections. In: 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2006), vol.\u00a02, pp. 1605\u20131614. IEEE (2006)","DOI":"10.1109\/CVPR.2006.326"},{"key":"9_CR77","unstructured":"Seitzer, M., et\u00a0al.: Bridging the gap to real-world object-centric learning. arXiv preprint arXiv:2209.14860 (2022)"},{"issue":"8","key":"9_CR78","doi-asserted-by":"publisher","first-page":"888","DOI":"10.1109\/34.868688","volume":"22","author":"J Shi","year":"2000","unstructured":"Shi, J., Malik, J.: Normalized cuts and image segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 22(8), 888\u2013905 (2000)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR79","doi-asserted-by":"crossref","unstructured":"Shin, G., Albanie, S., Xie, W.: Unsupervised salient object detection with spectral cluster voting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3971\u20133980 (2022)","DOI":"10.1109\/CVPRW56347.2022.00442"},{"key":"9_CR80","unstructured":"Shin, G., Xie, W., Albanie, S.: Reco: retrieve and co-segment for zero-shot transfer. Adv. Neural Inf. Process. Syst. (2022)"},{"key":"9_CR81","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does clip know about a red circle? Visual prompt engineering for VLMs. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"9_CR82","unstructured":"Sim\u00e9oni, O., et al.: Localizing objects with self-supervised transformers and no labels. In: Proceedings of the British Machine Vision Conference (BMVC) (2021)"},{"key":"9_CR83","doi-asserted-by":"crossref","unstructured":"Sim\u00e9oni, O., Sekkat, C., Puy, G., Vobeck\u1ef3, A., Zablocki, \u00c9., P\u00e9rez, P.: Unsupervised object localization: observing the background to discover objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3176\u20133186 (2023)","DOI":"10.1109\/CVPR52729.2023.00310"},{"key":"9_CR84","doi-asserted-by":"crossref","unstructured":"Sim\u00e9oni, O., Zablocki, \u00c9., Gidaris, S., Puy, G., P\u00e9rez, P.: Unsupervised object localization in the era of self-supervised VITs: a survey. arXiv preprint arXiv:2310.12904 (2023)","DOI":"10.1007\/s11263-024-02167-8"},{"key":"9_CR85","unstructured":"Singh, S., Deshmukh, S., Sarkar, M., Krishnamurthy, B.: Locate: self-supervised object discovery via flow-guided graph-cut and bootstrapped self-training. In: Proceedings of the British Machine Vision Conference (BMVC) (2023)"},{"key":"9_CR86","doi-asserted-by":"crossref","unstructured":"Sivic, J., Russell, B.C., Efros, A.A., Zisserman, A., Freeman, W.T.: Discovering objects and their location in images. In: Tenth IEEE International Conference on Computer Vision (ICCV 2005), vol.\u00a01, pp. 370\u2013377. IEEE (2005)","DOI":"10.1109\/ICCV.2005.77"},{"key":"9_CR87","unstructured":"Snell, J., Swersky, K., Zemel, R.: Prototypical networks for few-shot learning. Adv. Neural Inf. Process. Syst. (2017)"},{"key":"9_CR88","unstructured":"Thoma, M.: A survey of semantic segmentation. arXiv preprint arXiv:1602.06541 (2016)"},{"key":"9_CR89","doi-asserted-by":"crossref","unstructured":"Thomee, B., et al.: Yfcc100m: the new data in multimedia research. Commun. ACM 59(2), 64\u201373 (2016)","DOI":"10.1145\/2812802"},{"key":"9_CR90","doi-asserted-by":"crossref","unstructured":"Tian, C., Wang, W., Zhu, X., Dai, J., Qiao, Y.: Vl-ltr: learning class-wise visual-linguistic representation for long-tailed visual recognition. In: European Conference on Computer Vision. Springer (2022)","DOI":"10.1007\/978-3-031-19806-9_5"},{"key":"9_CR91","doi-asserted-by":"crossref","unstructured":"Tuytelaars, T., Lampert, C.H., Blaschko, M.B., Buntine, W.: Unsupervised object discovery: a comparison. Int. J. Comput. Vision (2010)","DOI":"10.1007\/s11263-009-0271-8"},{"key":"9_CR92","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. (2017)"},{"key":"9_CR93","doi-asserted-by":"crossref","unstructured":"Vo, H.V., et al.: Unsupervised image matching and object discovery as optimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8287\u20138296 (2019)","DOI":"10.1109\/CVPR.2019.00848"},{"key":"9_CR94","doi-asserted-by":"crossref","unstructured":"Vo, H.V., P\u00e9rez, P., Ponce, J.: Toward unsupervised, multi-object discovery in large-scale image collections. In: ECCV 2020, Part XXIII 16, pp. 779\u2013795. Springer (2020)","DOI":"10.1007\/978-3-030-58592-1_46"},{"key":"9_CR95","doi-asserted-by":"crossref","unstructured":"Wang, F., Mei, J., Yuille, A.: Sclip: rethinking self-attention for dense vision-language inference. arXiv preprint arXiv:2312.01597 (2023)","DOI":"10.1007\/978-3-031-72664-4_18"},{"key":"9_CR96","unstructured":"Wang, J., et al.: Diffusion model is secretly a training-free open vocabulary semantic segmenter. arXiv preprint arXiv:2309.02773 (2023)"},{"key":"9_CR97","doi-asserted-by":"crossref","unstructured":"Wang, K., Liew, J.H., Zou, Y., Zhou, D., Feng, J.: Panet: few-shot image semantic segmentation with prototype alignment. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00929"},{"key":"9_CR98","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Freesolo: learning to segment objects without annotations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14176\u201314186 (2022)","DOI":"10.1109\/CVPR52688.2022.01378"},{"key":"9_CR99","doi-asserted-by":"crossref","unstructured":"Wang, X., Girdhar, R., Yu, S.X., Misra, I.: Cut and learn for unsupervised object detection and instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3124\u20133134 (2023)","DOI":"10.1109\/CVPR52729.2023.00305"},{"key":"9_CR100","doi-asserted-by":"crossref","unstructured":"Wang, Y., Shen, X., Hu, S.X., Yuan, Y., Crowley, J.L., Vaufreydaz, D.: Self-supervised transformers for unsupervised object discovery using normalized cut. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14543\u201314553 (2022)","DOI":"10.1109\/CVPR52688.2022.01414"},{"key":"9_CR101","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Cris: clip-driven referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11686\u201311695 (2022)","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"9_CR102","doi-asserted-by":"crossref","unstructured":"Weber, M., Welling, M., Perona, P.: Towards automatic discovery of object categories. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2000 (Cat. No. PR00662), vol.\u00a02, pp. 101\u2013108. IEEE (2000)","DOI":"10.1109\/CVPR.2000.854754"},{"key":"9_CR103","doi-asserted-by":"crossref","unstructured":"Wei, Y., et al.: iclip: bridging image classification and contrastive language-image pre-training for visual recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00272"},{"key":"9_CR104","doi-asserted-by":"crossref","unstructured":"Wu, Z., Leahy, R.: An optimal graph theoretic approach to data clustering: theory and its application to image segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 15(11), 1101\u20131113 (1993)","DOI":"10.1109\/34.244673"},{"key":"9_CR105","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance discrimination. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"9_CR106","unstructured":"Wysocza\u0144ska, M., Sim\u00e9oni, O., Ramamonjisoa, M., Bursuc, A., Trzci\u0144ski, T., P\u00e9rez, P.: Clip-dinoiser: teaching clip a few dino tricks. arXiv preprint arXiv:2312.12359 (2023)"},{"key":"9_CR107","doi-asserted-by":"crossref","unstructured":"Xian, Y., Choudhury, S., He, Y., Schiele, B., Akata, Z.: Semantic projection network for zero-and few-label semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8256\u20138265 (2019)","DOI":"10.1109\/CVPR.2019.00845"},{"key":"9_CR108","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Groupvit: semantic segmentation emerges from text supervision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"9_CR109","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De\u00a0Mello, S.: Open-vocabulary panoptic segmentation with text-to-image diffusion models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"9_CR110","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Learning open-vocabulary semantic segmentation models from natural language supervision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00287"},{"key":"9_CR111","doi-asserted-by":"publisher","unstructured":"Xu, M., et al.: A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. In: Proceedings of the European Conference on Computer Vision (ECCV). Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19818-2_42","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"9_CR112","doi-asserted-by":"crossref","unstructured":"Yin, Z., et al.: Transfgu: a top-down approach to fine-grained unsupervised semantic segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19818-2_5"},{"key":"9_CR113","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"9_CR114","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"9_CR115","unstructured":"Zadaianchuk, A., Kleindessner, M., Zhu, Y., Locatello, F., Brox, T.: Unsupervised semantic segmentation with self-supervised object-centric representations. In: Proceedings of the International Conference on Learning Representations (ICLR) (2023)"},{"key":"9_CR116","doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: Lit: zero-shot transfer with locked-image text tuning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"9_CR117","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Wu, Y.N., Zhu, S.C.: Mining and-or graphs for graph matching and object discovery. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 55\u201363 (2015)","DOI":"10.1109\/ICCV.2015.15"},{"key":"9_CR118","doi-asserted-by":"crossref","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vision (2019)","DOI":"10.1007\/s11263-018-1140-0"},{"key":"9_CR119","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: Proceedings of the European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19815-1_40"},{"issue":"4","key":"9_CR120","doi-asserted-by":"publisher","first-page":"862","DOI":"10.1109\/TPAMI.2014.2353617","volume":"37","author":"JY Zhu","year":"2014","unstructured":"Zhu, J.Y., Wu, J., Xu, Y., Chang, E., Tu, Z.: Unsupervised object class discovery via saliency-guided multiple class learning. IEEE Trans. Pattern Anal. Mach. Intell. 37(4), 862\u2013875 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR121","doi-asserted-by":"crossref","unstructured":"Zhuang, C., Zhai, A.L., Yamins, D.: Local aggregation for unsupervised learning of visual embeddings. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00610"},{"key":"9_CR122","doi-asserted-by":"crossref","unstructured":"Ziegler, A., Asano, Y.M.: Self-supervised learning of object parts for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01410"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72940-9_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T21:32:56Z","timestamp":1731792776000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72940-9_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,17]]},"ISBN":["9783031729393","9783031729409"],"references-count":122,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72940-9_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,17]]},"assertion":[{"value":"17 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}