{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T03:02:52Z","timestamp":1770087772289,"version":"3.49.0"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach. Intell. Res."],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s11633-025-1610-0","type":"journal-article","created":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T14:11:27Z","timestamp":1770041487000},"page":"214-226","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hierarchical Open-vocabulary Part-object Segmentation with Knowledge-guided SAM"],"prefix":"10.1007","volume":"23","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8808-1764","authenticated-orcid":false,"given":"Xin-Jian","family":"Wu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6743-4175","authenticated-orcid":false,"given":"Cheng-Lin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,2]]},"reference":[{"key":"1610_CR1","doi-asserted-by":"publisher","first-page":"833","DOI":"10.1007\/978-3-030-01234-2_49","volume-title":"Proceedings of the 15th European Conference on Computer 
Vision","author":"L C Chen","year":"2018","unstructured":"L. C. Chen, Y. Zhu, G. Papandreou, F. Schroff, H. Adam. Encoder-decoder with atrous separable convolution for semantic image segmentation. In Proceedings of the 15th European Conference on Computer Vision, Munich, Germany, pp. 833\u2013851, 2018. DOI: https:\/\/doi.org\/10.1007\/978-3-030-01234-2_49."},{"key":"1610_CR2","doi-asserted-by":"publisher","first-page":"3431","DOI":"10.1109\/CVPR.2015.7298965","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"J Long","year":"2015","unstructured":"J. Long, E. Shelhamer, T. Darrell. Fully convolutional networks for semantic segmentation. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Boston, USA, pp. 3431\u20133440, 2015. DOI: https:\/\/doi.org\/10.1109\/CVPR.2015.7298965."},{"key":"1610_CR3","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Proceedings of the 18th International Conference on Medical Image Computing and Computer-Assisted Intervention","author":"O Ronneberger","year":"2015","unstructured":"O. Ronneberger, P. Fischer, T. Brox. U-Net: Convolutional networks for biomedical image segmentation. In Proceedings of the 18th International Conference on Medical Image Computing and Computer-Assisted Intervention, Munich, Germany, pp. 234\u2013241, 2015. DOI: https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28."},{"key":"1610_CR4","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, G. Krueger, I. Sutskever. Learning transferable visual models from natural language supervision. In Proceedings of the 38th International Conference on Machine Learning, pp. 
8748\u20138763, 2021."},{"key":"1610_CR5","doi-asserted-by":"publisher","first-page":"4113","DOI":"10.1109\/CVPR52733.2024.00394","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"S Cho","year":"2024","unstructured":"S. Cho, H. Shin, S. Hong, A. Arnab, P. H. Seo, S. Kim. CAT-Seg: Cost aggregation for open-vocabulary semantic segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Seattle, USA, pp. 4113\u20134123, 2024. DOI: https:\/\/doi.org\/10.1109\/CVPR52733.2024.00394."},{"key":"1610_CR6","doi-asserted-by":"publisher","first-page":"540","DOI":"10.1007\/978-3-031-20059-5_31","volume-title":"Proceedings of the 17th European Conference on Computer Vision","author":"G Ghiasi","year":"2022","unstructured":"G. Ghiasi, X. Gu, Y. Cui, T. Y. Lin. Scaling open-vocabulary image segmentation with image-level labels. In Proceedings of the 17th European Conference on Computer Vision, Tel Aviv, Israel, pp. 540\u2013557, 2022. DOI: https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31."},{"key":"1610_CR7","doi-asserted-by":"publisher","first-page":"399","DOI":"10.1007\/978-3-031-73414-4_23","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"S Jiao","year":"2025","unstructured":"S. Jiao, H. Zhu, J. Huang, Y. Zhao, Y. Wei, H. Shi. Collaborative vision-text representation optimizing for open-vocabulary segmentation. In Proceedings of the 18th European Conference on Computer Vision, Milan, Italy, pp. 399\u2013416, 2025. DOI: https:\/\/doi.org\/10.1007\/978-3-031-73414-4_23."},{"key":"1610_CR8","doi-asserted-by":"publisher","first-page":"3992","DOI":"10.1109\/ICCV51070.2023.00371","volume-title":"Proceedings of IEEE\/CVF International Conference on Computer Vision","author":"A Kirillov","year":"2023","unstructured":"A. Kirillov, E. Mintun, N. Ravi, H. Mao, C. Rolland, L. Gustafson, T. Xiao, S. Whitehead, A. C. Berg, W. Y. Lo, P. Dollar, R. Girshick. 
Segment anything. In Proceedings of IEEE\/CVF International Conference on Computer Vision, Paris, France, pp. 3992\u20134003, 2023. DOI: https:\/\/doi.org\/10.1109\/ICCV51070.2023.00371."},{"key":"1610_CR9","doi-asserted-by":"publisher","unstructured":"S. Li, J. Cao, P. Ye, Y. Ding, C. Tu, T. Chen. ClipSAM: CLIP and SAM collaboration for zero-shot anomaly segmentation. Neurocomputing, vol. 618, Article number 129122, 2025. DOI: https:\/\/doi.org\/10.1016\/j.neucom.2024.129122.","DOI":"10.1016\/j.neucom.2024.129122"},{"key":"1610_CR10","doi-asserted-by":"publisher","first-page":"3635","DOI":"10.1109\/CVPRW63382.2024.00367","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"H Wang","year":"2024","unstructured":"H. Wang, P. K. A. Vasu, F. Faghri, R. Vemulapalli, M. Farajtabar, S. Mehta, M. Rastegari, O. Tuzel, H. Pouransari. SAM-CLIP: Merging vision foundation models towards semantic and spatial understanding. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Seattle, USA, pp. 3635\u20133647, 2024. DOI: https:\/\/doi.org\/10.1109\/CVPRW63382.2024.00367."},{"key":"1610_CR11","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1007\/978-3-031-72775-7_24","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"H Yuan","year":"2024","unstructured":"H. Yuan, X. Li, C. Zhou, Y. Li, K. Chen, C. C. Loy. Open-vocabulary SAM: Segment and recognize twenty-thousand classes interactively. In Proceedings of the 18th European Conference on Computer Vision, Milan, Italy, pp. 419\u2013437, 2024. DOI: https:\/\/doi.org\/10.1007\/978-3-031-72775-7_24."},{"issue":"3","key":"1610_CR12","doi-asserted-by":"publisher","first-page":"431","DOI":"10.1007\/s11633-022-1404-6","volume":"21","author":"Y Zhao","year":"2024","unstructured":"Y. Zhao, J. Li, Y. Tian. Parsing objects at a finer granularity: A survey. Machine Intelligence Research, vol. 21, no. 3, pp. 
431\u2013451, 2024. DOI: https:\/\/doi.org\/10.1007\/s11633-022-1404-6.","journal-title":"Machine Intelligence Research"},{"key":"1610_CR13","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"M Bucher","year":"2019","unstructured":"M. Bucher, T. H. Vu, M. Cord, P. P\u00e9rez. Zero-shot semantic segmentation. In Proceedings of the 33rd International Conference on Neural Information Processing Systems, Vancouver, Canada, Article number 43, 2019."},{"key":"1610_CR14","doi-asserted-by":"publisher","first-page":"8248","DOI":"10.1109\/CVPR.2019.00845","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Xian","year":"2019","unstructured":"Y. Xian, S. Choudhury, Y. He, B. Schiele, Z. Akata. Semantic projection network for zero- and few-label semantic segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA, pp. 8248\u20138257, 2019. DOI: https:\/\/doi.org\/10.1109\/CVPR.2019.00845."},{"key":"1610_CR15","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7287.001.0001","volume-title":"WordNet: An Electronic Lexical Database","author":"C Fellbaum","year":"1998","unstructured":"C. Fellbaum. WordNet: An Electronic Lexical Database, Cambridge, USA: MIT Press, 1998. DOI: https:\/\/doi.org\/10.7551\/mitpress\/7287.001.0001."},{"key":"1610_CR16","volume-title":"Proceedings of the 1st International Conference on Learning Representations","author":"T Mikolov","year":"2013","unstructured":"T. Mikolov, K. Chen, G. Corrado, J. Dean. Efficient estimation of word representations in vector space. In Proceedings of the 1st International Conference on Learning Representations, Scottsdale, USA, 2013."},{"key":"1610_CR17","volume-title":"Proceedings of the 10th International Conference on Learning Representations","author":"B Li","year":"2022","unstructured":"B. Li, K. Q. Weinberger, S. J. Belongie, V. Koltun, R. 
Ranftl. Language-driven semantic segmentation. In Proceedings of the 10th International Conference on Learning Representations, 2022."},{"key":"1610_CR18","doi-asserted-by":"publisher","first-page":"11573","DOI":"10.1109\/CVPR52688.2022.01129","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"J Ding","year":"2022","unstructured":"J. Ding, N. Xue, G. S. Xia, D. Dai. Decoupling zero-shot semantic segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 11573\u201311582, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.01129."},{"key":"1610_CR19","doi-asserted-by":"publisher","first-page":"736","DOI":"10.1007\/978-3-031-19818-2_42","volume-title":"Proceedings of the 17th European Conference on Computer Vision","author":"M Xu","year":"2022","unstructured":"M. Xu, Z. Zhang, F. Wei, Y. Lin, Y. Cao, H. Hu, X. Bai. A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. In Proceedings of the 17th European Conference on Computer Vision, Tel Aviv, Israel, pp. 736\u2013753, 2022. DOI: https:\/\/doi.org\/10.1007\/978-3-031-19818-2_42."},{"key":"1610_CR20","volume-title":"Open-vocabulary panoptic segmentation with maskCLIP","author":"Z Ding","year":"2022","unstructured":"Z. Ding, J. Wang, Z. Tu. Open-vocabulary panoptic segmentation with maskCLIP, [Online], Available: https:\/\/arxiv.org\/abs\/2208.08984v1, 2022."},{"key":"1610_CR21","doi-asserted-by":"publisher","first-page":"7061","DOI":"10.1109\/CVPR52729.2023.00682","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"F Liang","year":"2023","unstructured":"F. Liang, B. Wu, X. Dai, K. Li, Y. Zhao, H. Zhang, P. Zhang, P. Vajda, D. Marculescu. Open-vocabulary semantic segmentation with mask-adapted clip. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Vancouver, Canada, pp. 
7061\u20137070, 2023. DOI: https:\/\/doi.org\/10.1109\/CVPR52729.2023.00682."},{"key":"1610_CR22","doi-asserted-by":"publisher","first-page":"2945","DOI":"10.1109\/CVPR52729.2023.00288","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"M Xu","year":"2023","unstructured":"M. Xu, Z. Zhang, F. Wei, H. Hu, X. Bai. Side adapter network for open-vocabulary semantic segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Vancouver, Canada, pp. 2945\u20132954, 2023. DOI: https:\/\/doi.org\/10.1109\/CVPR52729.2023.00288."},{"key":"1610_CR23","doi-asserted-by":"publisher","first-page":"11175","DOI":"10.1109\/CVPR52729.2023.01075","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Zhou","year":"2023","unstructured":"Z. Zhou, Y. Lei, B. Zhang, L. Liu, Y. Liu. ZegCLIP: Towards adapting CLIP for zero-shot semantic segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Vancouver, Canada, pp. 11175\u201311185, 2023. DOI: https:\/\/doi.org\/10.1109\/CVPR52729.2023.01075."},{"issue":"1","key":"1610_CR24","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s11633-022-1369-5","volume":"20","author":"F L Chen","year":"2023","unstructured":"F. L. Chen, D. Z. Zhang, M. L. Han, X. Y. Chen, J. Shi, S. Xu, B. Xu. VLP: A survey on vision-language pretraining. Machine Intelligence Research, vol. 20, no. 1, pp. 38\u201356, 2023. DOI: https:\/\/doi.org\/10.1007\/s11633-022-1369-5.","journal-title":"Machine Intelligence Research"},{"key":"1610_CR25","volume-title":"VisualBERT: A simple and performant baseline for vision and language","author":"L H Li","year":"2019","unstructured":"L. H. Li, M. Yatskar, D. Yin, C. J. Hsieh, K. W. Chang. 
VisualBERT: A simple and performant baseline for vision and language, [Online], Available: https:\/\/arxiv.org\/abs\/1908.03557, 2019."},{"key":"1610_CR26","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"J Lu","year":"2019","unstructured":"J. Lu, D. Batra, D. Parikh, S. Lee. ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Proceedings of the 33rd International Conference on Neural Information Processing Systems, Vancouver, Canada, Article number 2, 2019."},{"key":"1610_CR27","volume-title":"Proceedings of the 8th International Conference on Learning Representations","author":"W Su","year":"2020","unstructured":"W. Su, X. Zhu, Y. Cao, B. Li, L. Lu, F. Wei, J. Dai. VL-BERT: Pre-training of generic visual-linguistic representations. In Proceedings of the 8th International Conference on Learning Representations, Addis Ababa, Ethiopia, 2020."},{"key":"1610_CR28","first-page":"4904","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"C Jia","year":"2021","unstructured":"C. Jia, Y. Yang, Y. Xia, Y. T. Chen, Z. Parekh, H. Pham, Q. V. Le, Y. H. Sung, Z. Li, T. Duerig. Scaling up visual and vision-language representation learning with noisy text supervision. In Proceedings of the 38th International Conference on Machine Learning, pp. 4904\u20134916, 2021."},{"key":"1610_CR29","volume-title":"Proceedings of the 10th International Conference on Learning Representations","author":"X Gu","year":"2021","unstructured":"X. Gu, T. Y. Lin, W. Kuo, Y. Cui. Open-vocabulary object detection via vision and language knowledge distillation. 
In Proceedings of the 10th International Conference on Learning Representations, 2021."},{"key":"1610_CR30","doi-asserted-by":"publisher","first-page":"14388","DOI":"10.1109\/CVPR46437.2021.01416","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Zareian","year":"2021","unstructured":"A. Zareian, K. D. Rosa, D. H. Hu, S. F. Chang. Open-vocabulary object detection using captions. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Nashville, USA, pp. 14388\u201314397, 2021. DOI: https:\/\/doi.org\/10.1109\/CVPR46437.2021.01416."},{"issue":"4","key":"1610_CR31","doi-asserted-by":"publisher","first-page":"617","DOI":"10.1007\/s11633-023-1385-0","volume":"21","author":"W Ji","year":"2024","unstructured":"W. Ji, J. Li, Q. Bi, T. Liu, W. Li, L. Cheng. Segment anything is not always perfect: An investigation of SAM on different real-world applications. Machine Intelligence Research, vol. 21, no. 4, pp. 617\u2013630, 2024. DOI: https:\/\/doi.org\/10.1007\/s11633-023-1385-0.","journal-title":"Machine Intelligence Research"},{"key":"1610_CR32","volume-title":"Grounded SAM: Assembling open-world models for diverse visual tasks","author":"T Ren","year":"2024","unstructured":"T. Ren, S. Liu, A. Zeng, J. Lin, K. Li, H. Cao, J. Chen, X. Huang, Y. Chen, F. Yan, Z. Zeng, H. Zhang, F. Li, J. Yang, H. Li, Q. Jiang, L. Zhang. Grounded SAM: Assembling open-world models for diverse visual tasks, [Online], Available: https:\/\/arxiv.org\/abs\/2401.14159, 2024."},{"key":"1610_CR33","volume-title":"Track anything: Segment anything meets videos","author":"J Yang","year":"2023","unstructured":"J. Yang, M. Gao, Z. Li, S. Gao, F. Wang, F. Zheng. Track anything: Segment anything meets videos, [Online], Available: https:\/\/arxiv.org\/abs\/2304.11968, 2023."},{"key":"1610_CR34","doi-asserted-by":"publisher","unstructured":"J. Ma, Y. He, F. Li, L. Han, C. You, B. Wang. Segment anything in medical images. 
Nature Communications, vol. 15, no. 1, Article number 654, 2024. DOI: https:\/\/doi.org\/10.1038\/s41467-024-44824-z.","DOI":"10.1038\/s41467-024-44824-z"},{"key":"1610_CR35","doi-asserted-by":"publisher","first-page":"314","DOI":"10.1007\/978-3-031-72784-9_18","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"X J Wu","year":"2024","unstructured":"X. J. Wu, R. Zhang, J. Qin, S. Ma, C. L. Liu. WPS-SAM: Towards weakly-supervised part segmentation with foundation models. In Proceedings of the 18th European Conference on Computer Vision, Milan, Italy, pp. 314\u2013333, 2024. DOI: https:\/\/doi.org\/10.1007\/978-3-031-72784-9_18."},{"key":"1610_CR36","doi-asserted-by":"publisher","first-page":"10995","DOI":"10.1109\/CVPR52729.2023.01058","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"X Dong","year":"2023","unstructured":"X. Dong, J. Bao, Y. Zheng, T. Zhang, D. Chen, H. Yang, M. Zeng, W. Zhang, L. Yuan, D. Chen, F. Wen, N. Yu. MaskCLIP: Masked self-distillation advances contrastive language-image pretraining. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Vancouver, Canada, pp. 10995\u201311005, 2023. DOI: https:\/\/doi.org\/10.1109\/CVPR52729.2023.01058."},{"key":"1610_CR37","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-981-97-5498-4_1","volume-title":"Proceedings of the 17th International Conference on Knowledge Science, Engineering and Management","author":"N Ding","year":"2024","unstructured":"N. Ding, Y. Lai, J. Liu. Knowledge enhanced zero-shot visual relationship detection. In Proceedings of the 17th International Conference on Knowledge Science, Engineering and Management, Birmingham, UK, pp. 3\u201315, 2024. 
DOI: https:\/\/doi.org\/10.1007\/978-981-97-5498-4_1."},{"key":"1610_CR38","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1109\/CVPR.2017.10","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"K Marino","year":"2017","unstructured":"K. Marino, R. Salakhutdinov, A. Gupta. The more you know: Using knowledge graphs for image classification. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, USA, pp. 20\u201328, 2017. DOI: https:\/\/doi.org\/10.1109\/CVPR.2017.10."},{"key":"1610_CR39","doi-asserted-by":"publisher","first-page":"11487","DOI":"10.1109\/CVPR.2019.01175","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"M Kampffmeyer","year":"2019","unstructured":"M. Kampffmeyer, Y. Chen, X. Liang, H. Wang, Y. Zhang, E. P. Xing. Rethinking knowledge graph propagation for zero-shot learning. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA, pp. 11487\u201311496, 2019. DOI: https:\/\/doi.org\/10.1109\/CVPR.2019.01175."},{"key":"1610_CR40","doi-asserted-by":"publisher","first-page":"6857","DOI":"10.1109\/CVPR.2018.00717","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"X Wang","year":"2018","unstructured":"X. Wang, Y. Ye, A. Gupta. Zero-shot recognition via semantic embeddings and knowledge graphs. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Salt Lake City, USA, pp. 6857\u20136866, 2018. DOI: https:\/\/doi.org\/10.1109\/CVPR.2018.00717."},{"key":"1610_CR41","doi-asserted-by":"publisher","first-page":"8876","DOI":"10.1609\/aaai.v33i01.33018876","volume-title":"Proceedings of the 33rd AAAI Conference on Artificial Intelligence","author":"S Shah","year":"2019","unstructured":"S. Shah, A. Mishra, N. Yadati, P. P. Talukdar. KVQA: Knowledge-aware visual question answering. 
In Proceedings of the 33rd AAAI Conference on Artificial Intelligence, Honolulu, USA, pp. 8876\u20138884, 2019. DOI: https:\/\/doi.org\/10.1609\/aaai.v33i01.33018876."},{"key":"1610_CR42","doi-asserted-by":"publisher","first-page":"1236","DOI":"10.1109\/CVPR52688.2022.00131","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"L Li","year":"2022","unstructured":"L. Li, T. Zhou, W. Wang, J. Li, Y. Yang. Deep hierarchical semantic segmentation. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 1236\u20131247, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.00131."},{"key":"1610_CR43","doi-asserted-by":"publisher","first-page":"15979","DOI":"10.1109\/CVPR52688.2022.01553","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K He","year":"2022","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Dollar, R. Girshick. Masked autoencoders are scalable vision learners. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 15979\u201315988, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.01553."},{"key":"1610_CR44","doi-asserted-by":"publisher","first-page":"128","DOI":"10.1007\/978-3-031-20074-8_8","volume-title":"Proceedings of the 17th European Conference on Computer Vision","author":"J He","year":"2022","unstructured":"J. He, S. Yang, S. Yang, A. Kortylewski, X. Yuan, J. N. Chen, S. Liu, C. Yang, Q. Yu, A. Yuille. PartImageNet: A large, high-quality dataset of parts. In Proceedings of the 17th European Conference on Computer Vision, Tel Aviv, Israel, pp. 128\u2013145, 2022. 
DOI: https:\/\/doi.org\/10.1007\/978-3-031-20074-8_8."},{"key":"1610_CR45","doi-asserted-by":"publisher","first-page":"1979","DOI":"10.1109\/CVPR.2014.254","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"X Chen","year":"2014","unstructured":"X. Chen, R. Mottaghi, X. Liu, S. Fidler, R. Urtasun, A. Yuille. Detect what you can: Detecting and representing objects using holistic models and body parts. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Columbus, USA, pp. 1979\u20131986, 2014. DOI: https:\/\/doi.org\/10.1109\/CVPR.2014.254."},{"issue":"2\u20133","key":"1610_CR46","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"M. Everingham, L. Van Gool, C. K. I. Williams, J. Winn, A. Zisserman. The PASCAL visual object classes (VOC) challenge. International Journal of Computer Vision, vol. 88, no. 2\u20133, pp. 303\u2013338, 2010. DOI: https:\/\/doi.org\/10.1007\/s11263-009-0275-4.","journal-title":"International Journal of Computer Vision"},{"issue":"1","key":"1610_CR47","doi-asserted-by":"publisher","first-page":"33","DOI":"10.3233\/IA-160093","volume":"10","author":"I Donadello","year":"2016","unstructured":"I. Donadello, L. Serafini. Integration of numeric and symbolic information for semantic image interpretation. Intelligenza Artificiale, vol. 10, no. 1, pp. 33\u201347, 2016. DOI: https:\/\/doi.org\/10.3233\/IA-160093.","journal-title":"Intelligenza Artificiale"},{"key":"1610_CR48","doi-asserted-by":"publisher","first-page":"7010","DOI":"10.1109\/CVPR52688.2022.00689","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"D Huynh","year":"2022","unstructured":"D. Huynh, J. Kuen, Z. Lin, J. Gu, E. Elhamifar. Open-vocabulary instance segmentation via robust cross-modal pseudo-labeling. 
In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 7010\u20137021, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.00689."},{"key":"1610_CR49","doi-asserted-by":"publisher","first-page":"887","DOI":"10.1109\/ICCV51070.2023.00088","volume-title":"Proceedings of IEEE\/CVF International Conference on Computer Vision","author":"X Xu","year":"2023","unstructured":"X. Xu, T. Xiong, Z. Ding, Z. Tu. MasQCLIP for open-vocabulary universal image segmentation. In Proceedings of IEEE\/CVF International Conference on Computer Vision, Paris, France, pp. 887\u2013898, 2023. DOI: https:\/\/doi.org\/10.1109\/ICCV51070.2023.00088."}],"container-title":["Machine Intelligence Research"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-025-1610-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11633-025-1610-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-025-1610-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T15:04:01Z","timestamp":1770044641000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11633-025-1610-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":49,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["1610"],"URL":"https:\/\/doi.org\/10.1007\/s11633-025-1610-0","relation":{},"ISSN":["2731-538X","2731-5398"],"issn-type":[{"value":"2731-538X","type":"print"},{"value":"2731-5398","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"10 June 
2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declared that they have no conflicts of interest to this work.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations of conflict of interest"}}]}}