{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T17:44:58Z","timestamp":1757612698036,"version":"3.44.0"},"reference-count":70,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T00:00:00Z","timestamp":1746489600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T00:00:00Z","timestamp":1746489600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Ningbo Municipal Natural Science Foundation of China","award":["No.2022J114"],"award-info":[{"award-number":["No.2022J114"]}]},{"name":"Innovation Challenge Project of China","award":["No. 2022T001"],"award-info":[{"award-number":["No. 2022T001"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["No. 62271274"],"award-info":[{"award-number":["No. 62271274"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Ningbo S&T Project","award":["No.2024Z004"],"award-info":[{"award-number":["No.2024Z004"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s00530-025-01806-5","type":"journal-article","created":{"date-parts":[[2025,5,6]],"date-time":"2025-05-06T07:07:39Z","timestamp":1746515259000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing open-vocabulary object detection through region-word and region-vision matching"],"prefix":"10.1007","volume":"31","author":[{"given":"Yi","family":"Chen","sequence":"first","affiliation":[]},{"given":"Chong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhehao","family":"Li","sequence":"additional","affiliation":[]},{"given":"Sunqi","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Jinhui","family":"Xiang","sequence":"additional","affiliation":[]},{"given":"Yuqi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jiangbo","family":"Qian","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,6]]},"reference":[{"key":"1806_CR1","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)"},{"key":"1806_CR2","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: Unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 779\u2013788 (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"1806_CR3","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"1806_CR4","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: European Conference on Computer Vision, pp. 213\u2013229 (2020). Springer","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1806_CR5","doi-asserted-by":"crossref","unstructured":"Sun, P., Zhang, R., Jiang, Y., Kong, T., Xu, C., Zhan, W., Tomizuka, M., Li, L., Yuan, Z., Wang, C., et al.: Sparse r-cnn: End-to-end object detection with learnable proposals. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14454\u201314463 (2021)","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"1806_CR6","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"1806_CR7","doi-asserted-by":"crossref","unstructured":"Bravo, M.A., Mittal, S., Brox, T.: Localized vision-language matching for open-vocabulary object detection. In: DAGM German Conference on Pattern Recognition, pp. 393\u2013408 (2022). Springer","DOI":"10.1007\/978-3-031-16788-1_24"},{"key":"1806_CR8","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., Misra, I.: Detecting twenty-thousand classes using image-level supervision. In: European Conference on Computer Vision, pp. 350\u2013368 (2022). Springer","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"1806_CR9","unstructured":"Lin, C., Sun, P., Jiang, Y., Luo, P., Qu, L., Haffari, G., Yuan, Z., Cai, J.: Learning object-language alignments for open-vocabulary object detection. arXiv preprint arXiv:2211.14843 (2022)"},{"key":"1806_CR10","doi-asserted-by":"crossref","unstructured":"Zhang, H., Zhao, Q., Zheng, L., Zeng, H., Ge, Z., Li, T., Xu, S.: Exploring region-word alignment in built-in detector for open-vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16975\u201316984 (2024)","DOI":"10.1109\/CVPR52733.2024.01606"},{"key":"1806_CR11","doi-asserted-by":"crossref","unstructured":"Kim, D., Angelova, A., Kuo, W.: Region-aware pretraining for open-vocabulary object detection with vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11144\u201311154 (2023)","DOI":"10.1109\/CVPR52729.2023.01072"},{"key":"1806_CR12","doi-asserted-by":"crossref","unstructured":"Kim, D., Angelova, A., Kuo, W.: Contrastive feature masking open-vocabulary vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15602\u201315612 (2023)","DOI":"10.1109\/ICCV51070.2023.01430"},{"key":"1806_CR13","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"1806_CR14","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916 (2021). PMLR"},{"key":"1806_CR15","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"1806_CR16","doi-asserted-by":"crossref","unstructured":"Wu, S., Zhang, W., Jin, S., Liu, W., Loy, C.C.: Aligning bag of regions for open-vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15254\u201315264 (2023)","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"1806_CR17","doi-asserted-by":"crossref","unstructured":"Li, J., Zhang, J., Li, J., Li, G., Liu, S., Lin, L., Li, G.: Learning background prompts to discover implicit knowledge for open vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16678\u201316687 (2024)","DOI":"10.1109\/CVPR52733.2024.01578"},{"key":"1806_CR18","doi-asserted-by":"crossref","unstructured":"Xu, Y., Zhang, M., Yang, X., Xu, C.: Exploring multi-modal contextual knowledge for open-vocabulary object detection. IEEE Transactions on Image Processing (2024)","DOI":"10.1109\/TIP.2024.3485518"},{"key":"1806_CR19","doi-asserted-by":"crossref","unstructured":"Minderer, M., Gritsenko, A., Stone, A., Neumann, M., Weissenborn, D., Dosovitskiy, A., Mahendran, A., Arnab, A., Dehghani, M., Shen, Z., et al.: Simple open-vocabulary object detection. In: European Conference on Computer Vision, pp. 728\u2013755 (2022). Springer","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"1806_CR20","unstructured":"Kuo, W., Cui, Y., Gu, X., Piergiovanni, A., Angelova, A.: F-vlm: Open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv:2209.15639 (2022)"},{"key":"1806_CR21","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., Li, G.: Learning to prompt for open-vocabulary object detection with vision-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14084\u201314093 (2022)","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"1806_CR22","doi-asserted-by":"crossref","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-vocabulary detr with conditional matching. In: European Conference on Computer Vision, pp. 106\u2013122 (2022). Springer","DOI":"10.1007\/978-3-031-20077-9_7"},{"issue":"1","key":"1806_CR23","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1109\/TPAMI.2016.2535231","volume":"39","author":"RG Cinbis","year":"2016","unstructured":"Cinbis, R.G., Verbeek, J., Schmid, C.: Weakly supervised object localization with multi-fold multiple instance learning. IEEE Trans. Pattern Anal. Mach. Intell. 39(1), 189\u2013203 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1806_CR24","doi-asserted-by":"crossref","unstructured":"Bilen, H., Pedersoli, M., Tuytelaars, T.: Weakly supervised object detection with convex clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1081\u20131089 (2015)","DOI":"10.1109\/CVPR.2015.7298711"},{"key":"1806_CR25","doi-asserted-by":"crossref","unstructured":"Bilen, H., Vedaldi, A.: Weakly supervised deep detection networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2846\u20132854 (2016)","DOI":"10.1109\/CVPR.2016.311"},{"key":"1806_CR26","doi-asserted-by":"crossref","unstructured":"Wan, F., Liu, C., Ke, W., Ji, X., Jiao, J., Ye, Q.: C-mil: Continuation multiple instance learning for weakly supervised object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2199\u20132208 (2019)","DOI":"10.1109\/CVPR.2019.00230"},{"key":"1806_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). Ieee","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1806_CR28","first-page":"33781","volume":"35","author":"H Bangalath","year":"2022","unstructured":"Bangalath, H., Maaz, M., Khattak, M.U., Khan, S.H., Shahbaz Khan, F.: Bridging the gap between object and image-level representations for open-vocabulary detection. Adv. Neural. Inf. Process. Syst. 35, 33781\u201333794 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1806_CR29","unstructured":"Zhou, X., Koltun, V., Kr\u00e4henb\u00fchl, P.: Probabilistic two-stage detection. arXiv preprint arXiv:2103.07461 (2021)"},{"key":"1806_CR30","unstructured":"Detector, A.-F.O.: Fcos: A simple and strong anchor-free object detector. IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE 44(4) (2022)"},{"key":"1806_CR31","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"1806_CR32","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"1806_CR33","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"1806_CR34","doi-asserted-by":"crossref","unstructured":"Chen, K., Pang, J., Wang, J., Xiong, Y., Li, X., Sun, S., Feng, W., Liu, Z., Shi, J., Ouyang, W., et al.: Hybrid task cascade for instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4974\u20134983 (2019)","DOI":"10.1109\/CVPR.2019.00511"},{"key":"1806_CR35","doi-asserted-by":"publisher","first-page":"261","DOI":"10.1007\/s11263-019-01247-4","volume":"128","author":"L Liu","year":"2020","unstructured":"Liu, L., Ouyang, W., Wang, X., Fieguth, P., Chen, J., Liu, X., Pietik\u00e4inen, M.: Deep learning for generic object detection: A survey. Int. J. Comput. Vision 128, 261\u2013318 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"1806_CR36","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"},{"key":"1806_CR37","doi-asserted-by":"crossref","unstructured":"Bansal, A., Sikka, K., Sharma, G., Chellappa, R., Divakaran, A.: Zero-shot object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 384\u2013400 (2018)","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"1806_CR38","unstructured":"Demirel, B., Cinbis, R.G., Ikizler-Cinbis, N.: Zero-shot object detection by hybrid region embedding. arXiv preprint arXiv:1805.06157 (2018)"},{"key":"1806_CR39","doi-asserted-by":"crossref","unstructured":"Hayat, N., Hayat, M., Rahman, S., Khan, S., Zamir, S.W., Khan, F.S.: Synthesizing the unseen for zero-shot object detection. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69535-4_10"},{"key":"1806_CR40","doi-asserted-by":"crossref","unstructured":"Rahman, S., Khan, S., Barnes, N.: Transductive learning for zero-shot object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6082\u20136091 (2019)","DOI":"10.1109\/ICCV.2019.00618"},{"key":"1806_CR41","doi-asserted-by":"crossref","unstructured":"Zhu, P., Wang, H., Saligrama, V.: Don\u2019t even look once: Synthesizing features for zero-shot detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11693\u201311702 (2020)","DOI":"10.1109\/CVPR42600.2020.01171"},{"key":"1806_CR42","doi-asserted-by":"crossref","unstructured":"Zhao, S., Gao, C., Shao, Y., Li, L., Yu, C., Ji, Z., Sang, N.: Gtnet: Generative transfer network for zero-shot object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 12967\u201312974 (2020)","DOI":"10.1609\/aaai.v34i07.6996"},{"key":"1806_CR43","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.-F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"1806_CR44","doi-asserted-by":"crossref","unstructured":"Wang, L., Liu, Y., Du, P., Ding, Z., Liao, Y., Qi, Q., Chen, B., Liu, S.: Object-aware distillation pyramid for open-vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11186\u201311196 (2023)","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"1806_CR45","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, pp. 5583\u20135594 (2021). PMLR"},{"key":"1806_CR46","unstructured":"Frome, A., Corrado, G.S., Shlens, J., Bengio, S., Dean, J., Ranzato, M., Mikolov, T.: Devise: A deep visual-semantic embedding model. Advances in neural information processing systems 26 (2013)"},{"key":"1806_CR47","unstructured":"Jayaraman, D., Grauman, K.: Zero-shot recognition with unreliable attributes. Advances in neural information processing systems 27 (2014)"},{"key":"1806_CR48","doi-asserted-by":"crossref","unstructured":"Joulin, A., Van Der\u00a0Maaten, L., Jabri, A., Vasilache, N.: Learning visual features from large weakly supervised data. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part VII 14, pp. 67\u201384 (2016). Springer","DOI":"10.1007\/978-3-319-46478-7_5"},{"key":"1806_CR49","doi-asserted-by":"crossref","unstructured":"Li, A., Jabri, A., Joulin, A., Van Der\u00a0Maaten, L.: Learning visual n-grams from web data. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4183\u20134192 (2017)","DOI":"10.1109\/ICCV.2017.449"},{"key":"1806_CR50","doi-asserted-by":"crossref","unstructured":"Xu, J., De\u00a0Mello, S., Liu, S., Byeon, W., Breuel, T., Kautz, J., Wang, X.: Groupvit: Semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"1806_CR51","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546 (2022)"},{"key":"1806_CR52","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: European Conference on Computer Vision, pp. 696\u2013712 (2022). Springer","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"1806_CR53","doi-asserted-by":"crossref","unstructured":"Schuster, S., Krishna, R., Chang, A., Fei-Fei, L., Manning, C.D.: Generating semantically precise scene graphs from textual descriptions for improved image retrieval. In: Proceedings of the Fourth Workshop on Vision and Language, pp. 70\u201380 (2015)","DOI":"10.18653\/v1\/W15-2812"},{"key":"1806_CR54","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"1806_CR55","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1806_CR56","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: A dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"1806_CR57","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.H., Zhou, L., Dai, X., Yuan, L., Li, Y., et al.: Regionclip: Region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"1806_CR58","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"1806_CR59","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)"},{"key":"1806_CR60","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"1806_CR61","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"1806_CR62","first-page":"22682","volume":"34","author":"F Wei","year":"2021","unstructured":"Wei, F., Gao, Y., Wu, Z., Hu, H., Lin, S.: Aligning pretraining for detection via object-level contrastive learning. Adv. Neural. Inf. Process. Syst. 34, 22682\u201322694 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1806_CR63","doi-asserted-by":"crossref","unstructured":"Gao, M., Xing, C., Niebles, J.C., Li, J., Xu, R., Liu, W., Xiong, C.: Open vocabulary object detection with pseudo bounding-box labels. In: European Conference on Computer Vision, pp. 266\u2013282 (2022). Springer","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"1806_CR64","doi-asserted-by":"crossref","unstructured":"Jeong, J., Park, G., Yoo, J., Jung, H., Kim, H.: Proxydet: Synthesizing proxy novel classes via classwise mixup for open-vocabulary object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 2462\u20132470 (2024)","DOI":"10.1609\/aaai.v38i3.28022"},{"key":"1806_CR65","doi-asserted-by":"crossref","unstructured":"Kim, J., Cho, E., Kim, S., Kim, H.J.: Retrieval-augmented open-vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17427\u201317436 (2024)","DOI":"10.1109\/CVPR52733.2024.01650"},{"key":"1806_CR66","unstructured":"Kuo, W., Cui, Y., Gu, X., Piergiovanni, A., Angelova, A.: F-vlm: Open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv:2209.15639 (2022)"},{"key":"1806_CR67","doi-asserted-by":"crossref","unstructured":"Shao, S., Li, Z., Zhang, T., Peng, C., Yu, G., Zhang, X., Li, J., Sun, J.: Objects365: A large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"issue":"301","key":"1806_CR68","first-page":"1","volume":"23","author":"TT Cai","year":"2022","unstructured":"Cai, T.T., Ma, R.: Theoretical foundations of t-sne for visualizing high-dimensional clustered data. J. Mach. Learn. Res. 23(301), 1\u201354 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"1806_CR69","doi-asserted-by":"crossref","unstructured":"Feng, C., Zhong, Y., Jie, Z., Chu, X., Ren, H., Wei, X., Xie, W., Ma, L.: Promptdet: Towards open-vocabulary detection using uncurated images. In: European Conference on Computer Vision, pp. 701\u2013717 (2022). Springer","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"1806_CR70","doi-asserted-by":"crossref","unstructured":"Chattopadhay, A., Sarkar, A., Howlader, P., Balasubramanian, V.N.: Grad-cam++: Generalized gradient-based visual explanations for deep convolutional networks. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 839\u2013847 (2018). IEEE","DOI":"10.1109\/WACV.2018.00097"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01806-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01806-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01806-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T15:03:32Z","timestamp":1756998212000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01806-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,6]]},"references-count":70,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["1806"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01806-5","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,5,6]]},"assertion":[{"value":"20 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The data used in this study comes from publicly available datasets, and therefore, no additional ethical approval or informed consent from participants is required.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and Informed Consent for Data Used"}}],"article-number":"232"}}