{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T16:10:38Z","timestamp":1780071038683,"version":"3.54.0"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T00:00:00Z","timestamp":1780012800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T00:00:00Z","timestamp":1780012800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["L252009"],"award-info":[{"award-number":["L252009"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62571294"],"award-info":[{"award-number":["62571294"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476069"],"award-info":[{"award-number":["62476069"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CCF-DiDi GAIA Collaborative Research Funds"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s11263-026-02886-0","type":"journal-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T15:55:53Z","timestamp":1780070153000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CMPF: Harmonizing Cross-Model Prior Fusion for Open-Vocabulary Segmentation"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5843-6411","authenticated-orcid":false,"given":"Sicheng","family":"Zhao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xi","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hongxun","family":"Yao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haosen","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yanhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sheng","family":"Jin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiatian","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haonan","family":"Lu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kui","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guiguang","family":"Ding","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,5,29]]},"reference":[{"key":"2886_CR1","unstructured":"Chen, Z., Duan, Y., Wang, W., He, J., Lu, T., Dai, J., & Qiao, Y. (2023). Vision transformer adapter for dense predictions. International Conference on Learning Representations."},{"key":"2886_CR2","doi-asserted-by":"crossref","unstructured":"Chen, X., Li, S., Lim, S.-N., Torralba, A. & Zhao, H. (2023). Open-vocabulary panoptic segmentation with embedding modulation. In: IEEE\/CVF International Conference on Computer Vision, pp. 1141\u20131150 .","DOI":"10.1109\/ICCV51070.2023.00111"},{"key":"2886_CR3","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102311","volume":"107","author":"Q Chen","year":"2024","unstructured":"Chen, Q., Chen, Y., Huang, Y., Xie, X., & Yang, L. (2024). Region-based online selective examination for weakly supervised semantic segmentation. Information Fusion, 107, Article 102311.","journal-title":"Information Fusion"},{"key":"2886_CR4","unstructured":"Cheng, B., Misra, I., Schwing, A. G., Kirillov, A., & Girdhar, R. (2021). Masked-attention mask transformer for universal image segmentation. Advances in Neural Information Processing Systems (pp. 1290\u20131299)."},{"key":"2886_CR5","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., & Schiele, B. (2016). The cityscapes dataset for semantic urban scene understanding. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 13213\u201313223).","DOI":"10.1109\/CVPR.2016.350"},{"key":"2886_CR6","unstructured":"Ding, Z., Wang, J., & Tu, Z. (2023). Open-vocabulary universal image segmentation with MaskCLIP. International Conference on Machine Learning (Vol. 202, pp. 8090\u20138102)."},{"key":"2886_CR7","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Xia, G.-S., & Dai, D. (2022). Decoupling zero-shot semantic segmentation. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 11573\u201311582).","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"2886_CR8","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. International Conference on Learning Representations."},{"issue":"2","key":"2886_CR9","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K. I., Winn, J., & Zisserman, A. (2010). The pascal visual object classes (VOC) challenge. International Journal of Computer Vision, 88(2), 303\u2013338.","journal-title":"International Journal of Computer Vision"},{"key":"2886_CR10","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., & Lin, T. Y. (2022). Scaling open-vocabulary image segmentation with image-level labels. European Conference on Computer Vision (pp. 540\u2013557).","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"2886_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, A., Doll\u00e1r, P., & Girshick, R. (2019). Lvis: A dataset for large vocabulary instance segmentation. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 5351\u20135359).","DOI":"10.1109\/CVPR.2019.00550"},{"key":"2886_CR12","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. International Conference on Machine Learning (Vol. 139, pp. 4904\u20134916)."},{"key":"2886_CR13","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A. C., Lo, W.-Y., Dollar, P., & Girshick, R. (2023). Segment anything. IEEE\/CVF International Conference on Computer Vision (pp. 4015\u20134026).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2886_CR14","doi-asserted-by":"crossref","unstructured":"Li, Y. (2022). Exploring plain vision transformer backbones for object detection. European Conference on Computer Vision (pp. 280\u2013296).","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"2886_CR15","doi-asserted-by":"crossref","unstructured":"Li, Y., Fan, J., Pan, Y., Yao, T., Lin, W. & Mei, T. (2022). Uni-eden: Universal encoder-decoder network by multi-granular vision-language pre-training. ACM Transactions on Multimedia Computing, Communications, and Applications 18(2) .","DOI":"10.1145\/3473140"},{"key":"2886_CR16","doi-asserted-by":"crossref","unstructured":"Liang, F., Wu, B., Dai, X., Li, K., Zhao, Y., Zhang, H., Zhang, P., Vajda, P. & Marculescu, D. (2023). Open-vocabulary semantic segmentation with mask-adapted clip. In: IEEE\/CVF International Conference on Computer Vision, pp. 7061\u20137070 .","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"2886_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102631","volume":"113","author":"J Li","year":"2025","unstructured":"Li, J., Chen, T., Wang, X., Zhong, Y., & Xiao, X. (2025). Adapting the segment anything model for multi-modal retinal anomaly detection and localization. Information Fusion, 113, Article 102631.","journal-title":"Information Fusion"},{"key":"2886_CR18","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Bourdev, L., Girshick, R., Hays, J., Perona, P., Ramanan, D., Zitnick, C. L., & Doll\u00e1r, P. (2014). Microsoft COCO: Common objects in context. European Conference on Computer Vision (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2886_CR19","doi-asserted-by":"crossref","unstructured":"Liu, Y., Bai, S., Li, G., Wang, Y., & Tang, Y. (2024). Open-vocabulary segmentation with semantic-assisted calibration. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 3491\u20133500).","DOI":"10.1109\/CVPR52733.2024.00335"},{"key":"2886_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Y., Ge, P., Wang, G., Liu, Q., & Huang, D. (2025). Multi-grained contrastive learning for text-supervised open-vocabulary semantic segmentation. ACM Transactions on Multimedia Computing, Communications, and Applications 21(3) .","DOI":"10.1145\/3711868"},{"key":"2886_CR21","first-page":"1","volume":"15","author":"J Ma","year":"2024","unstructured":"Ma, J., He, Y., Li, F., Han, L., You, C., & Wang, B. (2024). Segment anything in medical images. Nature Communications, 15, 1\u20139.","journal-title":"Nature Communications"},{"key":"2886_CR22","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Chen, X., Liu, X., Cho, N.-G., Lee, S.-W., Fidler, S., Urtasun, R., & Yuille, A. (2014). The role of context for object detection and semantic segmentation in the wild. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 891\u2013898).","DOI":"10.1109\/CVPR.2014.119"},{"key":"2886_CR23","doi-asserted-by":"crossref","unstructured":"Niu, H., Hu, J., Lin, J., Jiang, G., & Zhang, S. (2025). Eov-seg: efficient open-vocabulary panoptic segmentation. AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v39i6.32669"},{"key":"2886_CR24","doi-asserted-by":"crossref","unstructured":"Pan, F., Shin, I., Rameau, F., Lee, S., & Kweon, I. S. (2020). Unsupervised intra-domain adaptation for semantic segmentation through self-supervision. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 3763\u20133772).","DOI":"10.1109\/CVPR42600.2020.00382"},{"key":"2886_CR25","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. International Conference on Machine Learning (Vol. 139, pp. 8748\u20138763)."},{"key":"2886_CR26","doi-asserted-by":"crossref","unstructured":"Rao, Y., Zhao, W., Chen, G., Tang, Y., Zhu, Z., Huang, G., Zhou, J., & Lu, J. (2022). DenseCLIP: Language-guided dense prediction with context-aware prompting. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 18061\u201318070).","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"2886_CR27","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 10684\u201310695).","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2886_CR28","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems (Vol. 30)."},{"key":"2886_CR29","unstructured":"Wu, S., Zhang, W., Xu, L., Jin, S., Li, X., Liu, W., & Loy, C.C. (2024). CLIPSelf: Vision transformer distills itself for open-vocabulary dense prediction. In: International Conference on Learning Representations ."},{"key":"2886_CR30","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., & De Mello, S. (2023). Open-vocabulary panoptic segmentation with text-to-image diffusion models. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 2955\u20132966).","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"2886_CR31","doi-asserted-by":"crossref","unstructured":"Xu, X., Xiong, T., Ding, Z., & Tu, Z. (2023). Masqclip for open-vocabulary universal image segmentation. IEEE\/CVF International Conference on Computer Vision (pp. 887\u2013898).","DOI":"10.1109\/ICCV51070.2023.00088"},{"key":"2886_CR32","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Lin, Y., Cao, Y., Hu, H., & Bai, X. (2022). A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. European Conference on Computer Vision (pp. 736\u2013753).","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"2886_CR33","doi-asserted-by":"crossref","unstructured":"Yang, H., Ma, C., Wen, B., Jiang, Y., Yuan, Z., & Zhu, X. (2024). Recognize any regions. Advances in Neural Information Processing Systems (pp. 51312\u201351332).","DOI":"10.52202\/079017-1624"},{"key":"2886_CR34","doi-asserted-by":"crossref","unstructured":"Yu, F., Chen, H., Wang, X., Xian, W., Chen, Y., Liu, F., Madhavan, V., & Darrell, T. (2020). BDD100k: A diverse driving dataset for heterogeneous multitask learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2636\u20132645 .","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"2886_CR35","doi-asserted-by":"crossref","unstructured":"Yu, Q., He, J., Deng, X., Shen, X. & Chen, L.-C. (2023). Convolutions die hard: Open-vocabulary segmentation with single frozen convolutional CLIP. Advances in Neural Information Processing Systems (Vol. 36, pp. 32215\u201332234).","DOI":"10.52202\/075280-1399"},{"key":"2886_CR36","doi-asserted-by":"crossref","unstructured":"Yuan, H., Li, X., Zhou, C., Li, Y., Chen, K., & Loy, C.C. (2024). Open-vocabulary SAM: Segment and recognize twenty-thousand classes interactively. In: European Conference on Computer Vision, pp. 419\u2013437 .","DOI":"10.1007\/978-3-031-72775-7_24"},{"key":"2886_CR37","unstructured":"Zhang, C., Han, D., Qiao, Y., Kim, J. U., Bae, S.-H., Lee, S., & Hong, C. S. (2023). Faster segment anything: Towards lightweight sam for mobile applications arXiv preprint. arXiv:2306.14289"},{"issue":"8","key":"2886_CR38","doi-asserted-by":"publisher","first-page":"3119","DOI":"10.1007\/s11263-024-02022-w","volume":"132","author":"X Zhao","year":"2024","unstructured":"Zhao, X., Feng, W., Zhang, Z., Lv, J., Zhu, X., Lin, Z., Hu, J., & Shao, J. (2024). Cbnet: A plug-and-play network for segmentation-based scene text detection. International Journal of Computer Vision, 132(8), 3119\u20133138.","journal-title":"International Journal of Computer Vision"},{"issue":"8","key":"2886_CR39","doi-asserted-by":"publisher","first-page":"2399","DOI":"10.1007\/s11263-021-01479-3","volume":"129","author":"S Zhao","year":"2021","unstructured":"Zhao, S., Li, B., Xu, P., Yue, X., Ding, G., & Keutzer, K. (2021). Madan: Multi-source adversarial domain aggregation network for domain adaptation. International Journal of Computer Vision, 129(8), 2399\u20132424.","journal-title":"International Journal of Computer Vision"},{"key":"2886_CR40","doi-asserted-by":"crossref","unstructured":"Zhao, S., Yao, H., Lin, C., Gao, Y., & Ding, G. (2024). Multi-source-free domain adaptive object detection. International Journal of Computer Vision,132(12), 5950\u20135982.","DOI":"10.1007\/s11263-024-02170-z"},{"key":"2886_CR41","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Xiao, T., Fidler, S., Barriuso, A., & Torralba, A. (2017). Semantic understanding of scenes through the ADE20k dataset. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 302\u2013321).","DOI":"10.1007\/s11263-018-1140-0"},{"issue":"9","key":"2886_CR42","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"},{"key":"2886_CR43","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2021). Deformable detr: Deformable transformers for end-to-end object detection. International Conference on Learning Representations."},{"key":"2886_CR44","doi-asserted-by":"crossref","unstructured":"Zou*, X., Dou*, Z.-Y., Yang*, J., Gan, Z., Li, L., Li, C., Dai, X., Behl, H., Wang, J., Yuan, L., Peng, N., Wang, L., Lee*, Y.J., & Gao*, J.: Generalized decoding for pixel, image and language. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023).","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"2886_CR45","doi-asserted-by":"crossref","unstructured":"Zou*, X., Yang*, J., Zhang*, H., Li*, F., Li, L., Wang, J., Wang, L., Gao*, J., & Lee*, Y.J. (2023). Segment everything everywhere all at once. In: Advances in Neural Information Processing Systems, pp. 19769\u201319782 .","DOI":"10.52202\/075280-0868"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02886-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02886-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02886-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T15:56:04Z","timestamp":1780070164000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02886-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,29]]},"references-count":45,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["2886"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02886-0","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,29]]},"assertion":[{"value":"12 December 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 May 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}}],"article-number":"296"}}