{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:17:57Z","timestamp":1772907477708,"version":"3.50.1"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031200434","type":"print"},{"value":"9783031200441","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20044-1_16","type":"book-chapter","created":{"date-parts":[[2022,10,19]],"date-time":"2022-10-19T23:12:10Z","timestamp":1666221130000},"page":"275-292","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":48,"title":["Open-World Semantic Segmentation via\u00a0Contrasting and\u00a0Clustering Vision-Language Embedding"],"prefix":"10.1007","author":[{"given":"Quande","family":"Liu","sequence":"first","affiliation":[]},{"given":"Youpeng","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Jianhua","family":"Han","sequence":"additional","affiliation":[]},{"given":"Chunjing","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,20]]},"reference":[{"key":"16_CR1","first-page":"468","volume":"32","author":"M Bucher","year":"2019","unstructured":"Bucher, M., Tuan-Hung, V., Cord, M., P\u00e9rez, P.: Zero-shot semantic segmentation. Adv. Neural. Inf. Process. Syst. 32, 468\u2013479 (2019)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: COCO-Stuff: thing and stuff classes in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1209\u20131218 (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"16_CR3","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. arXiv preprint arXiv:2006.09882 (2020)"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12\u00a0m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"issue":"4","key":"16_CR5","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"L-C Chen","year":"2017","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: DeepLab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR6","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Chen, X., Shrivastava, A., Gupta, A.: Neil: extracting visual knowledge from web data. In: 2013 IEEE International Conference on Computer Vision, pp. 1409\u20131416 (2013)","DOI":"10.1109\/ICCV.2013.178"},{"key":"16_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: universal image-text representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, J., Nandi, S., Natarajan, P., Abd-Almageed, W.: Sign: spatial-information incorporated generative network for generalized zero-shot semantic segmentation (2021)","DOI":"10.1109\/ICCV48922.2021.00942"},{"issue":"2","key":"16_CR10","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. Int. J. Comput. Vis. 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vis."},{"issue":"10","key":"16_CR11","doi-asserted-by":"publisher","first-page":"3614","DOI":"10.1109\/TPAMI.2020.2981604","volume":"43","author":"C Geng","year":"2020","unstructured":"Geng, C., Huang, S., Chen, S.: Recent advances in open set recognition: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 43(10), 3614\u20133631 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR12","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation (2021)"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Gu, Z., Zhou, S., Niu, L., Zhao, Z., Zhang, L.: Context-aware feature generation for zero-shot semantic segmentation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1921\u20131929 (2020)","DOI":"10.1145\/3394171.3413593"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"16_CR15","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Huang, Z., Zeng, Z., Huang, Y., Liu, B., Fu, D., Fu, J.: Seeing out of the box: end-to-end pre-training for vision-language representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12976\u201312985 (2021)","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"16_CR17","unstructured":"Huo, Y., et al.: WenLan: bridging vision and language by large-scale multi-modal pre-training. arXiv preprint arXiv:2103.06561 (2021)"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Hwang, J.-J., et al.: SegSort: segmentation by discriminative sorting of segments. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7334\u20137344 (2019)","DOI":"10.1109\/ICCV.2019.00743"},{"key":"16_CR19","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. arXiv preprint arXiv:2102.05918 (2021)"},{"key":"16_CR20","unstructured":"Jocher, G.: ultralytics\/yolov5: v3.1 - Bug Fixes and Performance Improvements, October 2020. https:\/\/github.com\/ultralytics\/yolov5"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Kato, N., Yamasaki, T., Aizawa, K.: Zero-shot semantic segmentation via variational mapping. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00172"},{"issue":"6","key":"16_CR22","first-page":"90","volume":"1","author":"TM Kodinariya","year":"2013","unstructured":"Kodinariya, T.M., Makwana, P.R.: Review on determining number of cluster in k-means clustering. Int. J. 1(6), 90\u201395 (2013)","journal-title":"Int. J."},{"key":"16_CR23","unstructured":"Li, P., Wei, Y., Yang, Y.: Consistent structural relation learning for zero-shot segmentation. In: Advances in Neural Information Processing Systems, vol. 33 (2020)"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Li, W., et al.: Unimo: towards unified-modal understanding and generation via cross-modal contrastive learning (2021)","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"16_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Lin, G., Milan, A., Shen, C., Reid, I.: RefineNet: multi-path refinement networks for high-resolution semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1925\u20131934 (2017)","DOI":"10.1109\/CVPR.2017.549"},{"key":"16_CR27","unstructured":"Lin, J., et al.: M6: a Chinese multimodal pretrainer. arXiv preprint arXiv:2103.00823 (2021)"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"16_CR29","unstructured":"Loshchilov, I., Hutter, F.: Fixing weight decay regularization in Adam (2018)"},{"issue":"7","key":"16_CR30","first-page":"3523","volume":"44","author":"S Minaee","year":"2021","unstructured":"Minaee, S., Boykov, Y.Y., Porikli, F., Plaza, A.J., Kehtarnavaz, N., Terzopoulos, D.: Image segmentation using deep learning: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 44(7), 3523\u20133542 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Oza, P., Patel, V.M.: C2AE: class conditioned auto-encoder for open-set recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2307\u20132316 (2019)","DOI":"10.1109\/CVPR.2019.00241"},{"key":"16_CR33","unstructured":"Pakhomov, D., Hira, S., Wagle, N., Green, K.E., Navab, N.: Segmentation in style: unsupervised semantic image segmentation with styleGAN and clip. arXiv preprint arXiv:2107.12518 (2021)"},{"issue":"6","key":"16_CR34","doi-asserted-by":"publisher","first-page":"1191","DOI":"10.1162\/089976603321780272","volume":"15","author":"L Paninski","year":"2003","unstructured":"Paninski, L.: Estimation of entropy and mutual information. Neural Comput. 15(6), 1191\u20131253 (2003)","journal-title":"Neural Comput."},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Perera, P., et al.: Generative-discriminative feature representations for open-set recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11814\u201311823 (2020)","DOI":"10.1109\/CVPR42600.2020.01183"},{"key":"16_CR36","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020 (2021)"},{"key":"16_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"issue":"7","key":"16_CR38","doi-asserted-by":"publisher","first-page":"1757","DOI":"10.1109\/TPAMI.2012.256","volume":"35","author":"WJ Scheirer","year":"2012","unstructured":"Scheirer, W.J., de Rezende Rocha, A., Sapkota, A., Boult, T.E.: Toward open set recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35(7), 1757\u20131772 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR39","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)"},{"issue":"2","key":"16_CR40","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: YFCC100M: the new data in multimedia research. Commun. ACM 59(2), 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"16_CR41","unstructured":"Tschannen, M., Djolonga, J., Rubenstein, P.K., Gelly, S., Lucic, M.: On mutual information maximization for representation learning. arXiv preprint arXiv:1907.13625 (2019)"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Van Gansbeke, W., Vandenhende, S., Georgoulis, S., Van Gool, L.: Unsupervised semantic segmentation by contrasting object mask proposals. arXiv preprint arXiv:2102.06191 (2021)","DOI":"10.1109\/ICCV48922.2021.00990"},{"key":"16_CR43","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: SimVLM: simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)"},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Xian, Y., Choudhury, S., He, Y., Schiele, B., Akata, Z.: Semantic projection network for zero- and few-label semantic segmentation. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8248\u20138257 (2019)","DOI":"10.1109\/CVPR.2019.00845"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Xian, Y., Choudhury, S., He, Y., Schiele, B., Akata, Z.: Semantic projection network for zero-and few-label semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8256\u20138265 (2019)","DOI":"10.1109\/CVPR.2019.00845"},{"key":"16_CR46","unstructured":"Xie, J., Zheng, S.: ZSD-YOLO: zero-shot yolo detection using vision-language knowledge distillation (2021)"},{"key":"16_CR47","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model. arXiv preprint arXiv:2112.14757 (2021)","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"16_CR48","doi-asserted-by":"crossref","unstructured":"Ye, M., Zhang, X., Yuen, P.C., Chang, S.-F.: Unsupervised embedding learning via invariant and spreading instance feature. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6210\u20136219 (2019)","DOI":"10.1109\/CVPR.2019.00637"},{"key":"16_CR49","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.-F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"16_CR50","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2881\u20132890 (2017)","DOI":"10.1109\/CVPR.2017.660"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20044-1_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T19:50:14Z","timestamp":1710359414000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20044-1_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200434","9783031200441"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20044-1_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"20 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}