{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:20:35Z","timestamp":1775578835141,"version":"3.50.1"},"publisher-location":"Cham","reference-count":71,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031731945","type":"print"},{"value":"9783031731952","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73195-2_27","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T09:35:45Z","timestamp":1732613745000},"page":"467-484","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":70,"title":["Segment and\u00a0Recognize Anything at\u00a0Any Granularity"],"prefix":"10.1007","author":[{"given":"Feng","family":"Li","sequence":"first","affiliation":[]},{"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Peize","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Xueyan","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Shilong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Chunyuan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jianwei","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jianfeng","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"issue":"5","key":"27_CR1","doi-asserted-by":"publisher","first-page":"898","DOI":"10.1109\/TPAMI.2010.161","volume":"33","author":"P Arbelaez","year":"2010","unstructured":"Arbelaez, P., Maire, M., Fowlkes, C., Malik, J.: Contour detection and hierarchical image segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 33(5), 898\u2013916 (2010)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"27_CR2","doi-asserted-by":"crossref","unstructured":"Arbel\u00e1ez, P., Pont-Tuset, J., Barron, J.T., Marques, F., Malik, J.: Multiscale combinatorial grouping. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 328\u2013335 (2014)","DOI":"10.1109\/CVPR.2014.49"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Bolya, D., Zhou, C., Xiao, F., Lee, Y.J.: Yolact: real-time instance segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9157\u20139166 (2019)","DOI":"10.1109\/ICCV.2019.00925"},{"key":"27_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"issue":"4","key":"27_CR6","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"27_CR7","unstructured":"Chen, L.C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation. arXiv preprint arXiv:1706.05587 (2017)"},{"key":"27_CR8","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Chen, X., Mottaghi, R., Liu, X., Fidler, S., Urtasun, R., Yuille, A.: Detect what you can: detecting and representing objects using holistic models and body parts (2014)","DOI":"10.1109\/CVPR.2014.254"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Chen, X., Zhao, Z., Zhang, Y., Duan, M., Qi, D., Zhao, H.: Focalclick: towards practical interactive image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1300\u20131309 (2022)","DOI":"10.1109\/CVPR52688.2022.00136"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A. G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"de Geus, D., Meletis, P., Lu, C., Wen, X., Dubbelman, G.: Part-aware panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5485\u20135494 (2021)","DOI":"10.1109\/CVPR46437.2021.00544"},{"key":"27_CR14","unstructured":"Ding, Z., Wang, J., Tu, Z.: Open-vocabulary panoptic segmentation with maskclip. arXiv preprint arXiv:2208.08984 (2022)"},{"key":"27_CR15","unstructured":"Everingham, M., Winn, J.: The pascal visual object classes challenge 2012 (voc2012) development kit. In: Pattern Analysis, Statistical Modelling and Computational Learning, Technical Report, vol. 8, no. 5 (2011)"},{"issue":"9","key":"27_CR16","doi-asserted-by":"publisher","first-page":"1627","DOI":"10.1109\/TPAMI.2009.167","volume":"32","author":"PF Felzenszwalb","year":"2009","unstructured":"Felzenszwalb, P.F., Girshick, R.B., McAllester, D., Ramanan, D.: Object detection with discriminatively trained part-based models. IEEE Trans. Pattern Anal. Mach. Intell. 32(9), 1627\u20131645 (2009)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"1","key":"27_CR17","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/0031-3203(81)90028-5","volume":"13","author":"F King-Sun","year":"1981","unstructured":"King-Sun, F., Mui, J.K.: A survey on image segmentation. Pattern Recogn. 13(1), 3\u201316 (1981)","journal-title":"Pattern Recogn."},{"key":"27_CR18","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Open-vocabulary image segmentation. arXiv preprint arXiv:2112.12143 (2021)"},{"issue":"11","key":"27_CR19","doi-asserted-by":"publisher","first-page":"1768","DOI":"10.1109\/TPAMI.2006.233","volume":"28","author":"L Grady","year":"2006","unstructured":"Grady, L.: Random walks for image segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 28(11), 1768\u20131783 (2006)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"27_CR20","doi-asserted-by":"crossref","unstructured":"Grundmann, M., Kwatra, V., Han, M., Essa, I.: Efficient hierarchical graph-based video segmentation. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 2141\u20132148. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5539893"},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: a dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"27_CR22","unstructured":"Guzman-Rivera, A., Batra, D., Kohli, P.: Multiple choice learning: learning to produce multiple structured outputs. Adv. Neural Inf. Process. Syst. 25 (2012)"},{"key":"27_CR23","unstructured":"He, J., et al.: Partimagenet: a large, high-quality dataset of parts. arXiv preprint arXiv:2112.00933 (2021)"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"27_CR25","doi-asserted-by":"crossref","unstructured":"Jagadeesh, S.K., Schuster, R., Stricker, D.: Multi-task fusion for efficient panoptic-part segmentation. arXiv preprint arXiv:2212.07671 (2022)","DOI":"10.5220\/0011616000003411"},{"key":"27_CR26","doi-asserted-by":"crossref","unstructured":"Jain, J., Li, J., Chiu, M.T., Hassani, A., Orlov, N., Shi, H.: Oneformer: one transformer to rule universal image segmentation. arXiv preprint arXiv:2211.06220 (2022)","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"27_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1007\/978-3-030-58601-0_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"R Ji","year":"2020","unstructured":"Ji, R., et al.: Learning semantic neural tree for human parsing. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12358, pp. 205\u2013221. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58601-0_13"},{"key":"27_CR28","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision, In: ICML (2021)"},{"key":"27_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"316","DOI":"10.1007\/978-3-030-58452-8_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Jia","year":"2020","unstructured":"Jia, M., et al.: Fashionpedia: ontology, segmentation, and an attribute localization dataset. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 316\u2013332. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_19"},{"key":"27_CR30","doi-asserted-by":"crossref","unstructured":"Kirillov, A., He, K., Girshick, R., Rother, C., Doll\u00e1r, P.: Panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9404\u20139413 (2019)","DOI":"10.1109\/CVPR.2019.00963"},{"key":"27_CR31","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"27_CR32","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546 (2022)"},{"key":"27_CR33","doi-asserted-by":"crossref","unstructured":"Li, F., et\u00a0al.: Mask dino: towards a unified transformer-based framework for object detection and segmentation. arXiv preprint arXiv:2206.02777 (2022)","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Li, Q., Arnab, A., Torr, P.H.S.: Holistic, instance-level human parsing. arXiv preprint arXiv:1709.03612 (2017)","DOI":"10.5244\/C.31.25"},{"key":"27_CR35","doi-asserted-by":"publisher","unstructured":"Li, X., Xu, S., Yang, Y., Cheng, G., Tong, Y., Tao, D.: Panoptic-partformer: learning a unified model for panoptic part segmentation. In: European Conference on Computer Vision, pp. 729\u2013747. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_42","DOI":"10.1007\/978-3-031-19812-0_42"},{"issue":"3","key":"27_CR36","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1145\/1015706.1015719","volume":"23","author":"Y Li","year":"2004","unstructured":"Li, Y., Sun, J., Tang, C.-K., Shum, H.-Y.: Lazy snapping. ACM Trans. Graph. (ToG) 23(3), 303\u2013308 (2004)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Li, Z., Chen, Q., Koltun, V.: Interactive image segmentation with latent diversity. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 577\u2013585 (2018)","DOI":"10.1109\/CVPR.2018.00067"},{"key":"27_CR38","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Panoptic segformer: delving deeper into panoptic segmentation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1280\u20131289 (2022)","DOI":"10.1109\/CVPR52688.2022.00134"},{"key":"27_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Liu, Q., Xu, Z., Bertasius, G., Niethammer, M.: Simpleclick: interactive image segmentation with simple vision transformers. arXiv preprint arXiv:2210.11006 (2022)","DOI":"10.1109\/ICCV51070.2023.02037"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"27_CR43","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"27_CR44","unstructured":"Meletis, P., Wen, X., Lu, C., de Geus, D., Dubbelman, G.: Cityscapes-panoptic-parts and pascal-panoptic-parts datasets for scene understanding. arXiv preprint arXiv:2004.07944 (2020)"},{"issue":"7","key":"27_CR45","first-page":"3523","volume":"44","author":"S Minaee","year":"2021","unstructured":"Minaee, S., Boykov, Y., Porikli, F., Plaza, A., Kehtarnavaz, N., Terzopoulos, D.: Image segmentation using deep learning: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 44(7), 3523\u20133542 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"27_CR46","unstructured":"OpenAI. Chatgpt (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"27_CR47","unstructured":"OpenAI. Gpt-4 technical report (2023)"},{"key":"27_CR48","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"27_CR49","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., et al.: PACO: parts and attributes of common objects. In arXiv preprint arXiv:2301.01795 (2023)","DOI":"10.1109\/CVPR52729.2023.00690"},{"key":"27_CR50","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"27_CR51","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28, 91\u201399 (2015)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"27_CR52","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"27_CR53","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"27_CR54","doi-asserted-by":"crossref","unstructured":"Song, X., et al.: Apollocar3d: a large 3d car instance understanding benchmark for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5452\u20135462 (2019)","DOI":"10.1109\/CVPR.2019.00560"},{"key":"27_CR55","doi-asserted-by":"crossref","unstructured":"Sun, P., et al.: Going denser with open-vocabulary part segmentation (2023)","DOI":"10.1109\/ICCV51070.2023.01417"},{"key":"27_CR56","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"27_CR57","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The caltech-ucsd birds-200-2011 dataset. Technical Report (2011)"},{"key":"27_CR58","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, X., Cao, Y., Wang, W., Shen, C., Huang, T.: Seggpt: segmenting everything in context. arXiv preprint arXiv:2304.03284 (2023)","DOI":"10.1109\/ICCV51070.2023.00110"},{"key":"27_CR59","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Groupvit: semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"27_CR60","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De\u00a0Mello, S.: Open-vocabulary panoptic segmentation with text-to-image diffusion models. arXiv preprint arXiv:2303.04803 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"27_CR61","doi-asserted-by":"crossref","unstructured":"Xu, N., Price, B., Cohen, S., Yang, J., Huang, T.S.: Deep interactive object selection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 373\u2013381 (2016)","DOI":"10.1109\/CVPR.2016.47"},{"key":"27_CR62","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Unified contrastive learning in image-text-label space. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"27_CR63","doi-asserted-by":"crossref","unstructured":"Yang, L., Song, Q., Wang, Z., Jiang, M.: Parsing r-cnn for instance-level human analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 364\u2013373 (2019)","DOI":"10.1109\/CVPR.2019.00045"},{"key":"27_CR64","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: Mp-former: mask-piloted transformer for image segmentation. arXiv preprint arXiv:2303.07336 (2023)","DOI":"10.1109\/CVPR52729.2023.01733"},{"key":"27_CR65","unstructured":"Zhang, H., et al.: A simple framework for open-vocabulary segmentation and detection. arXiv preprint arXiv:2303.08131 (2023)"},{"key":"27_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"27_CR67","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"27_CR68","doi-asserted-by":"crossref","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ade20k dataset (2018)","DOI":"10.1007\/s11263-018-1140-0"},{"key":"27_CR69","unstructured":"Zou, X., et\u00a0al.: Generalized decoding for pixel, image, and language. arXiv preprint arXiv:2212.11270 (2022)"},{"key":"27_CR70","unstructured":"Zou, X., et al.: Segment everything everywhere all at once. arXiv preprint arXiv:2304.06718 (2023)"},{"key":"27_CR71","unstructured":"Zou, Z., Shi, Z., Guo, Y., Ye, J.: Object detection in 20 years: a survey. arXiv preprint arXiv:1905.05055 (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73195-2_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T10:15:17Z","timestamp":1732616117000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73195-2_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031731945","9783031731952"],"references-count":71,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73195-2_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}