{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T13:10:37Z","timestamp":1779282637660,"version":"3.51.4"},"reference-count":82,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:00:00Z","timestamp":1773100800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:00:00Z","timestamp":1773100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62172247"],"award-info":[{"award-number":["62172247"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62172247"],"award-info":[{"award-number":["62172247"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100014761","name":"Qingdao Natural Science Foundation","doi-asserted-by":"crossref","award":["No. 23- 2-1-163-zyyd-jch"],"award-info":[{"award-number":["No. 23- 2-1-163-zyyd-jch"]}],"id":[{"id":"10.13039\/501100014761","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100014761","name":"Qingdao Natural Science Foundation","doi-asserted-by":"crossref","award":["No. 23- 2-1-163-zyyd-jch"],"award-info":[{"award-number":["No. 23- 2-1-163-zyyd-jch"]}],"id":[{"id":"10.13039\/501100014761","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Textile Plus Joint Research Program of Qingdao University","award":["No. FZ2024101"],"award-info":[{"award-number":["No. FZ2024101"]}]},{"name":"Textile Plus Joint Research Program of Qingdao University","award":["No. FZ2024101"],"award-info":[{"award-number":["No. FZ2024101"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s00530-026-02267-0","type":"journal-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T13:57:55Z","timestamp":1773151075000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["RobustOVS: open-vocabulary segmentation with robustly semantic-assisted calibration"],"prefix":"10.1007","volume":"32","author":[{"given":"Ruihan","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guodong","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingtao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,3,10]]},"reference":[{"key":"2267_CR1","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Papandreou, G., Iasonas Kokkinos,Kevin Murphy, and, Alan, L., Yuille: Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence, 40(4):834\u2013848, 1, 2, 4, 6 (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"issue":"1","key":"2267_CR2","first-page":"6","volume":"2","author":"J Ding","year":"2022","unstructured":"Ding, J., Xue, N., Xia, G.-S., Dai, D.: Decoupling zero-shot semantic segmentation. CVPR. 2(1), 6 (2022)","journal-title":"CVPR"},{"issue":"2","key":"2267_CR3","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham, M., Eslami, S.M.A., Van Gool, L., Williams, C.K.I.: John Winn, and Andrew Zisserman. The Pascal visual object classes challenge: A retrospective. IJCV. 111(2), 98\u2013136 (2015)","journal-title":"IJCV"},{"key":"2267_CR4","first-page":"6","volume":"5","author":"X Roozbeh Mottaghi","year":"2014","unstructured":"Roozbeh Mottaghi, X., Chen, X., Liu, N.-G., Cho, S.-W., Lee, S., Fidler, R., Urtasun, Alan, L.: Yuille. The role of context for object detection and semantic segmentation in the wild. CVPR. 5, 6 (2014)","journal-title":"CVPR"},{"issue":"1","key":"2267_CR5","first-page":"7","volume":"2","author":"M Xu","year":"2021","unstructured":"Xu, M., Zhang, Z., Wei, F., Lin, Y., Cao, Y., Han Hu, and, Bai, X.: A simple baseline for Zeroshot semantic segmentation with pre-trained vision-language model. ArXiv Preprint arXiv:2112 14757. 2(1), 7 (2021)","journal-title":"ArXiv Preprint arXiv:2112 14757"},{"key":"2267_CR6","doi-asserted-by":"crossref","unstructured":"Wang, L., Lu, H., Wang, Y., Feng, M., Wang, D., Yin, B., Ruan, X.: Learning to detect salient objects with image-level supervision. In CVPR, pages 136\u2013145, 3 (2017)","DOI":"10.1109\/CVPR.2017.404"},{"key":"2267_CR7","unstructured":"Ding, J., Xue, N., Xia, G.-S., Dai, D.: Decoupling zero-shot semantic segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 11583\u201311592, 1, 3, 4, 6, 7, 11, 12 (2022)"},{"key":"2267_CR8","unstructured":"Golnaz Ghiasi, X., Gu, Y., Cui, Tsung-Yi, Lin: Open-vocabulary image segmentation. arXiv preprint arXiv:2112.12143, 2021. 1, 2, 3, 4, 6, 7"},{"issue":"1","key":"2267_CR9","first-page":"7","volume":"3","author":"KQ Boyi Li","year":"2022","unstructured":"Boyi Li, K.Q., Weinberger, S., Belongie, V., Koltun, Ranftl, R.: Language-driven semantic segmentation. ArXiv Preprint arXiv:2201 03546. 3(1), 6, 7 (2022)","journal-title":"ArXiv Preprint arXiv:2201 03546"},{"issue":"1","key":"2267_CR10","first-page":"7","volume":"3","author":"M Xu","year":"2021","unstructured":"Xu, M., Zhang, Z., Wei, F., Lin, Y., Cao, Y., Han Hu, and, Bai, X.: A simple baseline for Zeroshot semantic segmentation with pre-trained vision-language model. ArXiv Preprint arXiv:2112 14757. 3(1), 7 (2021)","journal-title":"ArXiv Preprint arXiv:2112 14757"},{"key":"2267_CR11","doi-asserted-by":"crossref","unstructured":"Katherine Crowson, S., Biderman, D., Kornis, D., Stander, E., Hallahan, L., Castricato, Raff, E.: Vqgan-clip: Open domain image generation and editing with natural Language guidance. ArXiv Preprint arXiv:2204 08583, 3 (2022)","DOI":"10.1007\/978-3-031-19836-6_6"},{"key":"2267_CR12","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921, 2021. 3, 11"},{"key":"2267_CR13","unstructured":"Yiwu Zhong, J., Yang, P., Zhang, C., Li, N., Codella, L.H., Li, L., Zhou, X., Dai, L., Yuan, Y., Li, et al.: Regionclip: Regionbased language-image pretraining. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 16793\u201316803, 3, 4, 5 (2022)"},{"key":"2267_CR14","unstructured":"Ding, Z., Wang, J., Tu, Z.: Openvocabulary panoptic segmentation with maskclip. arXiv preprint arXiv:2208.08984, 3 (2022)"},{"key":"2267_CR15","unstructured":"Kim, K., Oh, Y., and Jong Chul Ye:. Zegot: Zeroshot segmentation through optimal transport of text prompts. arXiv preprint arXiv:2301.12171, 3 (2023)"},{"key":"2267_CR16","unstructured":"Luo, H., Bao, J., Wu, Y., He, X., Li, T.: Segclip: Patch aggregation with learnable centers for open-vocabulary semantic segmentation. arXiv preprint arXiv:2211.14813, 3 (2022)"},{"key":"2267_CR17","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Han Hu, and, Bai, X.: Side adapter network for open-vocabulary semantic segmentation. ArXiv Preprint arXiv:2302 12242, 3 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"issue":"3","key":"2267_CR18","first-page":"7","volume":"32","author":"T-H Maxime Bucher","year":"2019","unstructured":"Maxime Bucher, T.-H., Vu, M., Cord, Perez, P.: Zero-shot semantic segmentation. Adv. Neural. Inf. Process. Syst. 32(3), 6, 7 (2019)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2267_CR19","unstructured":"Yongqin Xian, S., Choudhury, Y., He, B., Schiele, and Zeynep Akata:. Semantic projection network for zero-and few-label semantic segmentation. In Proceedings of the IEEE\/CVF Conference on Computer VisionPattern Recognition, pages 8256\u20138265, 3, 6, 7 (2019)"},{"key":"2267_CR20","doi-asserted-by":"crossref","unstructured":"Xu, J., Mello, S.D., Liu, S., Byeon, W., Breuel, T., Kautz, J., Wang, X.: Groupvit: Semantic segmentation emerges from text supervision. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 18134\u201318144, 3 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"2267_CR21","doi-asserted-by":"crossref","unstructured":"Brian, Lester: Rami Al-Rfou, and Noah Constant. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691, 3 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"2267_CR22","unstructured":"Xiang Lisa Li and Percy Liang: Prefix-tuning: Optimizing continuous prompts for generation. ArXiv Preprint arXiv:2101 00190, 3 (2021)"},{"key":"2267_CR23","unstructured":"Pengfei Liu, W., Yuan, J., Fu, Z., Jiang, H., Hayashi, Neubig, G.: Pre-train, prompt, and predict: A systematic survey of prompting methods in natural Language processing. ArXiv Preprint arXiv:2107 13586, 3 (2021)"},{"issue":"9","key":"2267_CR24","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"J Kaiyang Zhou","year":"2022","unstructured":"Kaiyang Zhou, J., Yang, C.C., Loy, Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision. 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"2267_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollar, P.: and C Lawrence Zitnick. Microsoft coco: Common objects in context. In European conference on computer vision, pages 740\u2013755. Springer, 5 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2267_CR26","doi-asserted-by":"crossref","unstructured":"Holger Caesar, J., Uijlings, Ferrari, V.: Cocostuff: Thing and stuff classes in context. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 1209\u20131218, 2, 4, 5 (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"2267_CR27","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Dollar, P.: and C Lawrence Zitnick. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325, 2015. 2, 5, 7"},{"issue":"3","key":"2267_CR28","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"H Bolei Zhou","year":"2019","unstructured":"Bolei Zhou, H., Zhao, X., Puig, T., Xiao, S., Fidler, A., Barriuso, Torralba, A.: Semantic Understanding of scenes through the ade20k dataset. Int. J. Comput. Vision. 127(3), 302\u2013321 (2019)","journal-title":"Int. J. Comput. Vision"},{"issue":"2","key":"2267_CR29","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Andrew Zisserman: The Pascal visual object classes (voc) challenge. Int. J. Comput. Vision. 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vision"},{"key":"2267_CR30","doi-asserted-by":"crossref","unstructured":"Roozbeh Mottaghi, X., Chen, X., Liu, N.-G., Cho, S.-W., Lee, S., Fidler: Raquel Urtasun, and Alan Yuille. The role of context for object detection and semantic segmentation in the wild. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 891\u2013898, 2, 5 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"2267_CR31","unstructured":"Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101: 6 (2017)"},{"key":"2267_CR32","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K.: and Alan L. Yuille. Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected Crfs. TPAMI, 1 (2018)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"2267_CR33","doi-asserted-by":"crossref","unstructured":"Ding, H., Jiang, X., Shuai, B., Liu, A.Q., Wang, G.: Context contrasted feature and gated multiscale aggregation for scene segmentation. In CVPR, 1 (2018)","DOI":"10.1109\/CVPR.2018.00254"},{"key":"2267_CR34","doi-asserted-by":"crossref","unstructured":"Fang, Y., Zhu, F., Cheng, B., Liu, L., Wei, Y., Zhao, Y.: Locating noise is halfway denoising for semi-supervised segmentatio. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, 1 (2023)","DOI":"10.1109\/ICCV51070.2023.01523"},{"issue":"4","key":"2267_CR35","doi-asserted-by":"publisher","first-page":"1224","DOI":"10.3390\/s25041224","volume":"25","author":"S Muksimova","year":"2025","unstructured":"Muksimova, S., Valikhujaev, Y., Umirzakova, S., Baltayev, J.: Cho. GazeCapsNet: A lightweight gaze Estimation framework. Sensors. 25(4), 1224 (2025)","journal-title":"Sensors"},{"key":"2267_CR36","unstructured":"Guo, M.-H., Lu, C., Hou, Q., Liu, Z.-N., Cheng, M.-M., Shi-Min, H.: Segnext: Rethinking convolutional attention design for semantic segmentation. arXiv preprint arXiv:2209.08575, 1 (2022)"},{"key":"2267_CR37","unstructured":"Jonathan, L.: Evan Shelhamer, and Trevor Darrell. Fully convolutional networks for semantic segmentation. In CVPR, 1 (2015)"},{"key":"2267_CR38","unstructured":"Mengxue Qu, Y., Wu, Y., Wei, W., Liu, X., Liang, Zhao, Y.: Learning to segment every referring object point by point. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 1 (2023)"},{"key":"2267_CR39","doi-asserted-by":"crossref","unstructured":"Olaf Ronneberger, P., Fischer, Brox, T.: U-net: Convolutional networks for biomedical image segmentation. In MICCAI, 1 (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"2267_CR40","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: Simple and efficient design for semantic segmentation with transformers. In NIPS, 1 (2021)"},{"key":"2267_CR41","unstructured":"Golnaz Ghiasi, X., Gu, Y., Cui, Tsung-Yi, Lin: Open-vocabulary image segmentation. arXiv preprint arXiv: 2112.12143, 1, 2, 6 (2021)"},{"key":"2267_CR42","doi-asserted-by":"crossref","unstructured":"Kunyang Han, Y., Liu, J.H., Liew, H., Ding, J., Liu, Y., Wang, Y., Tang, Y., Yang, J., Feng, Y., Zhao, et al.: Global knowledge calibration for fast open-vocabulary segmentation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 797\u2013807, 1, 2 (2023)","DOI":"10.1109\/ICCV51070.2023.00080"},{"key":"2267_CR43","doi-asserted-by":"crossref","unstructured":"He, S., Ding, H., Jiang, W.: Primitive generation and semantic-related alignment for universal zero-shot segmentation. In CVPR, 1 (2023)","DOI":"10.1109\/CVPR52729.2023.01081"},{"issue":"1","key":"2267_CR44","first-page":"6","volume":"3","author":"B Li","year":"2022","unstructured":"Li, B., Weinberger, K.Q., Belongie, S.J., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. ICLR. 3(1), 6 (2022)","journal-title":"ICLR"},{"key":"2267_CR45","unstructured":"Liu, Y., Zhang, C., Wang, Y., Wang, J., Yang, Y., and Yansong Tang:. Universal segmentation at arbitrary granularity with language instruction. arXiv preprint arXiv:2312.01623, 1 (2023)"},{"key":"2267_CR46","doi-asserted-by":"publisher","first-page":"112827","DOI":"10.1016\/j.knosys.2024.112827","volume":"309","author":"Souvik Chowdhury and Badal Soni","year":"2025","unstructured":"Souvik Chowdhury and Badal Soni: R-VQA: A robust visual question answering model. Knowl. Based Syst. 309, 112827 (2025)","journal-title":"Knowl. Based Syst."},{"key":"2267_CR47","doi-asserted-by":"crossref","unstructured":"Souvik Chowdhury and Badal Soni: Beyond words: ESC-Net revolutionizes VQA by elevating visual features and defying Language priors. Comput. Intell., 1 (2024)","DOI":"10.1111\/coin.70010"},{"key":"2267_CR48","doi-asserted-by":"publisher","first-page":"109948","DOI":"10.1016\/j.engappai.2024.109948","volume":"142","author":"Souvik Chowdhury and Badal Soni","year":"2025","unstructured":"Souvik Chowdhury and Badal Soni: Improving visual question answering model by enriching the visual feature. Eng. Appl. Artif. Intell. 142, 109948 (2025)","journal-title":"Eng. Appl. Artif. Intell."},{"key":"2267_CR49","doi-asserted-by":"publisher","first-page":"10479","DOI":"10.1007\/s13369-023-07661-8","volume":"48","author":"Souvik Chowdhury and Badal Soni","year":"2023","unstructured":"Souvik Chowdhury and Badal Soni: A time efficient, scalable and optimized VQA framework. Arab. J. Sci. Eng. 48, 10479\u201310491 (2023)","journal-title":"Arab. J. Sci. Eng."},{"key":"2267_CR50","doi-asserted-by":"publisher","first-page":"129906","DOI":"10.1016\/j.neucom.2025.129906","volume":"635","author":"Souvik Chowdhury and Badal Soni","year":"2025","unstructured":"Souvik Chowdhury and Badal Soni: Handling Language prior and compositional reasoning issues in visual question answering system. Neurocomputing. 635, 129906 (2025)","journal-title":"Neurocomputing"},{"key":"2267_CR51","first-page":"2","volume":"1","author":"S Yongqin Xian","year":"2019","unstructured":"Yongqin Xian, S., Choudhury, Y., He, B., Schiele, Akata, Z.: Semantic projection network for zero- and few-label semantic segmentation. CVPR. 1, 2 (2019)","journal-title":"CVPR"},{"key":"2267_CR52","doi-asserted-by":"crossref","unstructured":"Hui Zhang and Henghui Ding: Prototypical matching and open set rejection for zero-shot semantic segmentation. In ICCV, 1 (2021)","DOI":"10.1109\/ICCV48922.2021.00689"},{"key":"2267_CR53","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De Mello, S.: Open-vocabulary panoptic segmentation with text-to-image diffusion models. In CVPR, pages 2955\u20132966, 1, 2, 6 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"issue":"5","key":"2267_CR54","first-page":"8","volume":"2","author":"H Bolei Zhou","year":"2017","unstructured":"Bolei Zhou, H., Zhao, X., Puig, S., Fidler: Adela Barriuso, and Antonio Torralba. Scene parsing through ADE20K dataset. CVPR. 2(5), 6, 8 (2017)","journal-title":"CVPR"},{"key":"2267_CR55","doi-asserted-by":"crossref","unstructured":"Chen, L., Yang, Q., Ding, K., et al.: Efficient redundancy reduction for Open-Vocabulary semantic Segmentation[J]. (2025). arXiv preprint arXiv:2501.17642","DOI":"10.1016\/j.neucom.2025.132229"},{"key":"2267_CR56","unstructured":"Pang, L., Yao, J., Li, K., et al.: SPECIAL: Zero-shot hyperspectral image classification with CLIP[J]. (2025). arXiv preprint arXiv:2501.16222"},{"key":"2267_CR57","unstructured":"Sun, H., Gong, R., Nejjar, I., et al.: DynAlign: Unsupervised dynamic taxonomy alignment for Cross-Domain Segmentation[J]. (2025). arXiv preprint arXiv:2501.16410"},{"key":"2267_CR58","unstructured":"Zhang, D., Feng, T., Xue, L., et al.: Parameter-Efficient Fine-Tuning for foundation Models[J]. (2025). arXiv preprint arXiv:2501.13787"},{"key":"2267_CR59","unstructured":"Li, K., Cao, X., Deng, Y., et al.: DynamicEarth: How Far are we from Open-Vocabulary change Detection?[J]. (2025). arXiv preprint arXiv:2501.12931"},{"key":"2267_CR60","doi-asserted-by":"publisher","first-page":"621","DOI":"10.1016\/j.isprsjprs.2025.01.006","volume":"220","author":"V Zermatten","year":"2025","unstructured":"Zermatten, V., Castillo-Navarro, J., Marcos, D., et al.: Learning transferable land cover semantics for open vocabulary interactions with remote sensing images[J]. ISPRS J. Photogrammetry Remote Sens. 220, 621\u2013636 (2025)","journal-title":"ISPRS J. Photogrammetry Remote Sens."},{"key":"2267_CR61","doi-asserted-by":"crossref","unstructured":"Choi, J., Lee, S., Lee, M., et al.: Fine-Grained Image-Text correspondence with cost aggregation for Open-Vocabulary part Segmentation[J]. (2025). arXiv preprint arXiv:2501.09688","DOI":"10.1109\/CVPR52734.2025.00914"},{"key":"2267_CR62","doi-asserted-by":"crossref","unstructured":"Bai, M., Yu, X., Wang, Y., et al.: Enhancing pixel-level analysis in medical imaging through visual instruction tuning: Introducing PLAMi[J]. Visual Comput., : 1\u201317. (2024)","DOI":"10.1007\/s00371-024-03742-3"},{"key":"2267_CR63","doi-asserted-by":"crossref","unstructured":"Zhou, E., Su, Q., Chi, C., et al.: Code-as-Monitor: Constraint-aware visual programming for reactive and proactive robotic failure Detection[J]. (2024). arXiv preprint arXiv:2412.04455","DOI":"10.1109\/CVPR52734.2025.00649"},{"key":"2267_CR64","doi-asserted-by":"crossref","unstructured":"Huang, C., Yan, S., Burgard, W.: BYE: Build your encoder with one sequence of exploration data for Long-Term dynamic scene Understanding[J]. (2024). arXiv preprint arXiv:2412.02449","DOI":"10.1109\/LRA.2025.3542693"},{"key":"2267_CR65","doi-asserted-by":"crossref","unstructured":"Dao, S.D., Shi, H., Phung, D.Q., et al.: CA-Ovs: Cluster and Adapt Mask Proposals for Open-Vocabulary Semantic Segmentation[C]\/\/Proceedings of the 6th ACM International Conference on Multimedia in Asia. : 1\u20138. (2024)","DOI":"10.1145\/3696409.3700213"},{"key":"2267_CR66","unstructured":"Maxime Bucher, T.-H., Vu: Matthieu Cord, and Patrick Perez. Zero-shot semantic segmentation. In NeurIPS, 2 (2019)"},{"issue":"1","key":"2267_CR67","doi-asserted-by":"crossref","first-page":"6","DOI":"10.37357\/1068\/jser.2.1.02","volume":"2","author":"JW Alec Radford","year":"2021","unstructured":"Alec Radford, J.W., Kim, C., Hallacy, A., Ramesh, G., Goh, S., Agarwal, G., Sastry, A., Askell: Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. Learning transferable visual models from natural Language supervision. ICML. 2(1), 6 (2021)","journal-title":"ICML"},{"key":"2267_CR68","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In CVPR, pages 770\u2013778, 6 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2267_CR69","unstructured":"Mingxing Tan and Quoc Le: Efficientnet: Rethinking model scaling for convolutional neural networks. In ICML, pages 6105\u20136114, 6 (2019)"},{"key":"2267_CR70","unstructured":"Bowen Cheng, A.G., Schwing, Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. In NeurIPS, 2 (2021)"},{"key":"2267_CR71","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, F., Xu, H., Huang, S., Liu, S., Lionel, M., Ni, Zhang, L.: Mp-former: Mask-piloted transformer for image segmentation. ArXiv Preprint (2023). arXiv:2303.07336","DOI":"10.1109\/CVPR52729.2023.01733"},{"key":"2267_CR72","doi-asserted-by":"crossref","unstructured":"Cavagnero, N., Rosi, G., Cuttano, C., et al.: Pem: Prototype-based efficient maskformer for image segmentation[C]\/\/Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. : 15804\u201315813. (2024)","DOI":"10.1109\/CVPR52733.2024.01496"},{"key":"2267_CR73","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., et al.: Swin transformer: Hierarchical vision transformer using shifted windows[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. : 10012\u201310022. (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2267_CR74","doi-asserted-by":"crossref","unstructured":"Xu, J., De Mello, S., Liu, S., Byeon, W., Breuel, T.M., Kautz, J., Wang, X.: Groupvit: Semantic segmentation emerges from text supervision. In CVPR, 6 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"2267_CR75","unstructured":"Feng Liang, B., Wu, X., Dai, K., Li, Y., Zhao, H., Zhang, P., Zhang, P., Vajda, Marculescu, D.: Open-vocabulary semantic segmentation with mask-adapted CLIP. arXiv preprint arXiv:2210.04150, 2022. 2, 5, 6, 7"},{"issue":"5","key":"2267_CR76","first-page":"7","volume":"6","author":"J Siyu","year":"2023","unstructured":"Siyu, J., Wei, Y., Wang, Y., Zhao, Y., Humphrey, Shi: Learning mask-aware clip representations for zero-shot segmentation. ArXiv Preprint arXiv:2310 00240. 6(5), 7 (2023)","journal-title":"ArXiv Preprint arXiv:2310 00240"},{"key":"2267_CR77","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Han Hu, and, Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In CVPR, pages 2945\u20132954, 2, 5, 6, 7 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"2267_CR78","unstructured":"Zheng Ding, J., Wang, Tu, Z.: Open vocabulary panoptic segmentation with maskclip. arXiv preprint arXiv:2208.08984, 6 (2022)"},{"key":"2267_CR79","unstructured":"Xu, Y.-H., Wang, Z.-H., Wang, Z.-R., Fan, R., Wang, X.A.: Recommendation Algorithm Based on a Self-supervised Learning Pretrain Transformer"},{"key":"2267_CR80","unstructured":"Xu, Y.H., Wang, Z.H., Wang, Z.R., Guo, Y.L., Fan, R., Tian, H.Y., Wang: Xing SimDCL: dropout-based simple graph contrastive learning for recommendation"},{"key":"2267_CR81","unstructured":"Chen, H., Zhang, F., Li, Q., Li, X., Ding, Y., Zhang, D., Cheng, J., Wang: Xing Triple confidence-aware encoder-decoder model for commonsense knowledge graph completion"},{"key":"2267_CR82","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X.: Shalini De Mello. Open-Vocabulary Panoptic Segmentation with Text-to-Image Diffusion Models.arXiv: 2303. 04803, 3 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-026-02267-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-026-02267-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-026-02267-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T12:33:39Z","timestamp":1779280419000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-026-02267-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,10]]},"references-count":82,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["2267"],"URL":"https:\/\/doi.org\/10.1007\/s00530-026-02267-0","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-6850046\/v1","asserted-by":"object"}]},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,10]]},"assertion":[{"value":"9 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Declaration of generative AI and AI-assisted technologies in the writing process\n                      .","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"During the preparation of this work, the author(s) used ChatGPT-4.0 to improve the language and readability of the manuscript. After using this tool, the author(s) reviewed and edited the content as needed and take(s) full responsibility for the content of the publication.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"197"}}