{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:10:49Z","timestamp":1757617849618,"version":"3.44.0"},"reference-count":93,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T00:00:00Z","timestamp":1746230400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T00:00:00Z","timestamp":1746230400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s11263-025-02441-3","type":"journal-article","created":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T07:23:40Z","timestamp":1746257020000},"page":"5544-5568","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["P2Object: Single Point Supervised Object Detection and Instance Segmentation"],"prefix":"10.1007","volume":"133","author":[{"given":"Pengfei","family":"Chen","sequence":"first","affiliation":[]},{"given":"Xuehui","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Xumeng","family":"Han","sequence":"additional","affiliation":[]},{"given":"Kuiran","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Guorong","family":"Li","sequence":"additional","affiliation":[]},{"given":"Lingxi","family":"Xie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9970-5152","authenticated-orcid":false,"given":"Zhenjun","family":"Han","sequence":"additional","affiliation":[]},{"given":"Jianbin","family":"Jiao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,3]]},"reference":[{"key":"2441_CR1","doi-asserted-by":"crossref","unstructured":"Arbel\u00e1ez, P. A., Pont-Tuset, J., Barron, J. T., Marques, F., & Malik, J. (2014). Multiscale combinatorial grouping. In CVPR.","DOI":"10.1109\/CVPR.2014.49"},{"key":"2441_CR2","doi-asserted-by":"crossref","unstructured":"Bilen, H., & Vedaldi, A. (2016). Weakly supervised deep detection networks. In CVPR.","DOI":"10.1109\/CVPR.2016.311"},{"key":"2441_CR3","doi-asserted-by":"crossref","unstructured":"Bottou, L. (2012). Stochastic gradient descent tricks. In Neural networks: Tricks of the trade (2nd ed.). Springer.","DOI":"10.1007\/978-3-642-35289-8_25"},{"key":"2441_CR4","unstructured":"Cao, G., Yu, X., Yu, W., Han, X., Yang, X., Li, G., Jiao, J., & Han, Z. (2023). P2rbox: A single point is all you need for oriented object detection. CoRR arxiv:2311.13128"},{"key":"2441_CR5","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2441_CR6","doi-asserted-by":"crossref","unstructured":"Chen, H., Sun, K., Tian, Z., Shen, C., Huang, Y., & Yan, Y. (2020a). Blendmask: Top-down meets bottom-up for instance segmentation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00860"},{"key":"2441_CR7","unstructured":"Chen, K., Wang, J., & Pang, J. (2019). MMDetection: Open mmlab detection toolbox and benchmark. arXiv preprint arxiv:1906.07155"},{"key":"2441_CR8","doi-asserted-by":"crossref","unstructured":"Chen, L., Yang, T., Zhang, X., Zhang, W., & Sun, J. (2021). Points as queries: Weakly semi-supervised object detection by points. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00871"},{"key":"2441_CR9","doi-asserted-by":"crossref","unstructured":"Chen, P., Yu, X., Han, X., Hassan, N., Wang, K., Li, J., Zhao, J., Shi, H., Han, Z., & Ye, Q. (2022). Point-to-box network for accurate object detection via single point supervision. In ECCV.","DOI":"10.1007\/978-3-031-20077-9_4"},{"key":"2441_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Z., Fu, Z., Jiang, R., Chen, Y., & Hua, X. S. (2020b). SLV: spatial likelihood voting for weakly supervised object detection. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01301"},{"issue":"10","key":"2441_CR11","doi-asserted-by":"publisher","first-page":"2738","DOI":"10.1007\/s11263-023-01830-w","volume":"131","author":"Z Chen","year":"2023","unstructured":"Chen, Z., Zhang, J., Xu, Y., & Tao, D. (2023). Transformer-based context condensation for boosting feature pyramids in object detection. IJCV, 131(10), 2738\u20132756.","journal-title":"IJCV"},{"key":"2441_CR12","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A. G., Kirillov, A., & Girdhar, R. (2022a). Masked-attention mask transformer for universal image segmentation. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"2441_CR13","doi-asserted-by":"crossref","unstructured":"Cheng, B., Parkhi, O., & Kirillov, A. (2022b). Pointly-supervised instance segmentation. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00264"},{"key":"2441_CR14","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., & Schiele, B. (2016). The cityscapes dataset for semantic urban scene understanding. In CVPR.","DOI":"10.1109\/CVPR.2016.350"},{"key":"2441_CR15","doi-asserted-by":"crossref","unstructured":"Diba, A., Sharma, V., Pazandeh, A., Pirsiavash, H., & Van Gool, L. (2017). Weakly supervised cascaded convolutional networks. In CVPR.","DOI":"10.1109\/CVPR.2017.545"},{"key":"2441_CR16","doi-asserted-by":"crossref","unstructured":"Everingham, M., Gool, L. V., Williams, C. K., Winn, J., & Zisserman, A. (2010). The pascal visual object classes (VOC) challenge. In IJCV.","DOI":"10.1007\/s11263-009-0275-4"},{"issue":"12","key":"2441_CR17","doi-asserted-by":"publisher","first-page":"3252","DOI":"10.1007\/s11263-023-01862-2","volume":"131","author":"J Fan","year":"2023","unstructured":"Fan, J., & Zhang, Z. (2023). Toward practical weakly supervised semantic segmentation via point-level supervision. IJCV, 131(12), 3252\u20133271.","journal-title":"IJCV"},{"key":"2441_CR18","doi-asserted-by":"crossref","unstructured":"Fan, J., Zhang, Z., & Tan, T. (2022). Pointly-supervised panoptic segmentation. In ECCV.","DOI":"10.1007\/978-3-031-20056-4_19"},{"key":"2441_CR19","doi-asserted-by":"crossref","unstructured":"Gao, M., Li, A., Yu, R., Morariu, V. I., & Davis, L. S. (2018). C-WSL: Count-guided weakly supervised localization. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_10"},{"key":"2441_CR20","doi-asserted-by":"crossref","unstructured":"Ge, W., Yang, S., & Yu, Y. (2018). Multi-evidence filtering and fusion for multi-label classification, object detection and semantic segmentation based on weakly supervised learning. In CVPR.","DOI":"10.1109\/CVPR.2018.00139"},{"key":"2441_CR21","doi-asserted-by":"crossref","unstructured":"Girshick, R. B. (2015). Fast R-CNN. In ICCV.","DOI":"10.1109\/ICCV.2015.169"},{"key":"2441_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J., (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2441_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask R-CNN. In ICCV.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2441_CR24","unstructured":"Hsu, C., Hsu, K., Tsai, C., Lin, Y., & Chuang, Y. (2019). Weakly supervised instance segmentation using the bounding box tightness prior. In NeurIPS."},{"key":"2441_CR25","unstructured":"Huang, Z., Zou, Y., Kumar, B. V. K., & Huang, D. (2020). Comprehensive attention self-distillation for weakly-supervised object detection. In NeurIPS."},{"key":"2441_CR26","doi-asserted-by":"crossref","unstructured":"Huang, Z., Bao, Y., Dong, B., Zhou, E., & Zuo, W. (2022). W2N: switching from weak supervision to noisy supervision for object detection. In ECCV.","DOI":"10.1007\/978-3-031-20056-4_41"},{"key":"2441_CR27","doi-asserted-by":"crossref","unstructured":"Jia, Q., Wei, S., Ruan, T., Zhao, Y., & Zhao, Y. (2021). Gradingnet: Towards providing reliable supervisions for weakly supervised object detection by grading the box candidates. In AAAI.","DOI":"10.1609\/aaai.v35i2.16261"},{"key":"2441_CR28","unstructured":"Jiang, N., Wang, K., Peng, X., Yu, X., Wang, Q., Xing, J., Li, G., Zhao, J., Guo, G., & Han, Z. (2021). Anti-UAV: A large multi-modal benchmark for UAV tracking. In IEEE TMM."},{"key":"2441_CR29","doi-asserted-by":"crossref","unstructured":"Kim, B., Yoo, Y., Rhee, C., & Kim, J. (2022). Beyond semantic to instance segmentation: Weakly-supervised instance segmentation via semantic knowledge transfer and self-refinement. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00424"},{"key":"2441_CR30","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A. C., Lo, W. Y., & Doll\u00e1r, P. (2023). Segment anything. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2441_CR31","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In NIPS."},{"key":"2441_CR32","doi-asserted-by":"crossref","unstructured":"Laradji, I. H., Rostamzadeh, N., Pinheiro, P. O., V\u00e1zquez, D., & Schmidt, M. (2020). Proposal-based instance segmentation with point supervision. In ICIP.","DOI":"10.1109\/ICIP40778.2020.9190782"},{"key":"2441_CR33","doi-asserted-by":"crossref","unstructured":"Lee, P., & Byun, H. (2021). Learning action completeness from points for weakly-supervised temporal action localization. In ICCV.","DOI":"10.1109\/ICCV48922.2021.01339"},{"key":"2441_CR34","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Xu, H., Liu, S., Zhang, L., Ni, L. M., & Shum, H. (2023a). Mask DINO: Towards a unified transformer-based framework for object detection and segmentation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"2441_CR35","doi-asserted-by":"crossref","unstructured":"Li, W., Yuan, Y., Wang, S., Zhu, J., Li, J., Liu, J., & Zhang, L. (2023b). Point2mask: Point-supervised panoptic segmentation via optimal transport. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00059"},{"key":"2441_CR36","doi-asserted-by":"crossref","unstructured":"Liao, M., Wan, F., Yao, Y., Han, Z., Zou, J., Wang, Y., Feng, B., Yuan, P., & Ye, Q. (2022). End-to-end weakly supervised object detection with sparse proposal evolution. In ECCV.","DOI":"10.1007\/978-3-031-20077-9_13"},{"key":"2441_CR37","doi-asserted-by":"crossref","unstructured":"Liao, M., Guo, Z., Wang, Y., Yuan, P., Feng, B., & Wan, F. (2023). Attentionshift: Iteratively estimated part-based attention map for pointly supervised instance segmentation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01870"},{"key":"2441_CR38","doi-asserted-by":"crossref","unstructured":"Lin, T., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., & Belongie, S. (2017a). Feature pyramid networks for object detection. In CVPR.","DOI":"10.1109\/CVPR.2017.106"},{"key":"2441_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T., Goyal, P., Girshick, R., He, K., & Doll\u00e1r, P. (2017b). Focal loss for dense object detection. In ICCV.","DOI":"10.1109\/ICCV.2017.324"},{"key":"2441_CR40","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., & Maire, M. E. (2014). Microsoft coco: Common objects in context. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2441_CR41","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Jiang, Q., Li, C., Yang, J., Su, H., Zhu, J., & Zhang, L. (2024). Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection. In ECCV.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"2441_CR42","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016). SSD: single shot multibox detector. In ECCV.","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"2441_CR43","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2441_CR44","doi-asserted-by":"crossref","unstructured":"Luo, J., Yang, X., Yu, Y., Li, Q., Yan, J., & Li, Y. (2024). Pointobb: Learning oriented object detection via single point supervision. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01583"},{"key":"2441_CR45","doi-asserted-by":"crossref","unstructured":"Papadopoulos, D. P., Uijlings, J. R. R., Keller, F., & Ferrari, V. (2017a). Training object class detectors with click supervision. In CVPR.","DOI":"10.1109\/CVPR.2017.27"},{"key":"2441_CR46","doi-asserted-by":"crossref","unstructured":"Papadopoulos, D. P., Uijlings, J. R. R., Keller, F., & Ferrari, V. (2017b). Extreme clicking for efficient object annotation. In ICCV.","DOI":"10.1109\/ICCV.2017.528"},{"key":"2441_CR47","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In Meila, M., & Zhang, T. (Eds.) ICML."},{"key":"2441_CR48","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S. K., Girshick, R., & Farhadi, A. (2016). You only look once: Unified, real-time object detection. In CVPR.","DOI":"10.1109\/CVPR.2016.91"},{"key":"2441_CR49","unstructured":"Ren, B., Yang, X., Yu, Y., Luo, J., & Deng, Z. (2024). Pointobb-v2: Towards simpler, faster, and stronger single point supervised oriented object detection. CoRR arxiv:2410.08210"},{"key":"2441_CR50","doi-asserted-by":"crossref","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2017). Faster R-CNN: Towards real-time object detection with region proposal networks. In IEEE TPAMI.","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"2441_CR51","doi-asserted-by":"crossref","unstructured":"Ren, Z., Yu, Z., Yang, X., Liu, M. Y., Lee, Y. J., Schwing, A. G., & Kautz, J. (2020a). Instance-aware, context-focused, and memory-efficient weakly supervised object detection. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01061"},{"key":"2441_CR52","doi-asserted-by":"crossref","unstructured":"Ren, Z., Yu, Z., Yang, X., Liu, M. Y., Schwing, A. G., & Kautz, J. (2020b). $$\\text{Ufo}^{\\text{2 }}$$: A unified framework towards omni-supervised object detection. In ECCV.","DOI":"10.1007\/978-3-030-58529-7_18"},{"key":"2441_CR53","doi-asserted-by":"crossref","unstructured":"Ribera, J., Guera, D., Chen, Y., & Delp, E. J. (2019). Locating objects without bounding boxes. In CVPR.","DOI":"10.1109\/CVPR.2019.00664"},{"key":"2441_CR54","doi-asserted-by":"crossref","unstructured":"van\u00a0de Sande, K. E. A., Uijlings, J. R. R., Gevers, T., & Smeulders, A. W. (2011). Segmentation as selective search for object recognition. In ICCV.","DOI":"10.1109\/ICCV.2011.6126456"},{"key":"2441_CR55","doi-asserted-by":"crossref","unstructured":"Seo, J., Bae, W., Sutherland, D. J., Noh, J., & Kim, D. (2022). Object discovery via contrastive learning for weakly supervised object detection. In ECCV.","DOI":"10.1007\/978-3-031-19821-2_18"},{"key":"2441_CR56","doi-asserted-by":"crossref","unstructured":"Shen, Y., Ji, R., Chen, Z., Wu, Y., & Huang, F. (2020). UWSOD: Toward fully-supervised-level capacity weakly supervised object detection. In NeurIPS.","DOI":"10.1109\/CVPR42600.2020.01134"},{"key":"2441_CR57","doi-asserted-by":"crossref","unstructured":"Shen, Y., Cao, L., Chen, Z., Zhang, B., Su, C., Wu, Y., Huang, F., & Ji, R. (2021). Parallel detection-and-segmentation learning for weakly supervised instance segmentation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00809"},{"key":"2441_CR58","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In ICLR."},{"key":"2441_CR59","doi-asserted-by":"crossref","unstructured":"Song, Q., Wang, C., Jiang, Z., Wang, Y., Tai, Y., Wang, C., Li, J., Huang, F., & Wu, Y. (2021). Rethinking counting and localization in crowds: A purely point-based framework. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00335"},{"key":"2441_CR60","doi-asserted-by":"crossref","unstructured":"Sun, P., Zhang, R., Jiang, Y., Kong, T., Xu, C., Zhan, W., Tomizuka, M., Li, L., Yuan, Z., Wang, C., & Luo, P. (2021). Sparse R-CNN: End-to-end object detection with learnable proposals. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"2441_CR61","doi-asserted-by":"crossref","unstructured":"Tang, C., Xie, L., Zhang, G., Zhang, X., Tian, Q., & Hu, X. (2022). Active pointly-supervised instance segmentation. In ECCV.","DOI":"10.1007\/978-3-031-19815-1_35"},{"key":"2441_CR62","doi-asserted-by":"crossref","unstructured":"Tang, P., Bai, X., & Liu, W. (2017). Multiple instance detection network with online instance classifier refinement. In CVPR.","DOI":"10.1109\/CVPR.2017.326"},{"key":"2441_CR63","doi-asserted-by":"crossref","unstructured":"Tang, P., Wang, X., Bai, S., Shen, W., Bai, X., Liu, W., & Yuille, A. (2020). PCL: Proposal cluster learning for weakly supervised object detection. In IEEE TPAMI.","DOI":"10.1109\/TPAMI.2018.2876304"},{"key":"2441_CR64","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., & He, T. (2019). FCOS: Fully convolutional one-stage object detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00972"},{"key":"2441_CR65","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Wang, X., & Chen, H. (2021). Boxinst: High-performance instance segmentation with box annotations. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00540"},{"key":"2441_CR66","doi-asserted-by":"crossref","unstructured":"Tian, Z., Zhang, B., Chen, H., & Shen, C. (2023). Instance and panoptic segmentation using conditional convolutions. In IEEE TPAMI.","DOI":"10.1109\/TPAMI.2022.3145407"},{"issue":"2","key":"2441_CR67","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JRR Uijlings","year":"2013","unstructured":"Uijlings, J. R. R., van de Sande, K. E. A., Gevers, T., & Smeulders, A. W. M. (2013). Selective search for object recognition. IJCV, 104(2), 154\u2013171.","journal-title":"IJCV"},{"key":"2441_CR68","doi-asserted-by":"crossref","unstructured":"Wan, F., Wei, P., Jiao, J., Han, Z., & Ye, Q. (2019). Min-entropy latent model for weakly supervised object detection. In IEEE TPAMI.","DOI":"10.1109\/CVPR.2018.00141"},{"key":"2441_CR69","unstructured":"Wang, W., Liang, J., & Liu, D. (2022). Learning equivariant segmentation with instance-unique querying. In NeurIPS."},{"key":"2441_CR70","doi-asserted-by":"crossref","unstructured":"Wang, X., Kong, T., Shen, C., Jiang, Y., & Li, L. (2020a). SOLO: Segmenting objects by locations. In ECCV.","DOI":"10.1007\/978-3-030-58523-5_38"},{"key":"2441_CR71","unstructured":"Wang, X., Zhang, R., Kong, T., Li, L., & Shen, C. (2020b). Solov2: Dynamic and fast instance segmentation. In NeurIPS."},{"key":"2441_CR72","unstructured":"Wang, Y., He, C., & Chen, X. (2023). Point-to-rbox network for oriented object detection via single point supervision. In BMVC."},{"key":"2441_CR73","doi-asserted-by":"crossref","unstructured":"Wei, Z., Chen, P., Yu, X., Li, G., Jiao, J., & Han, Z. (2024). Semantic-aware SAM for point-prompted instance segmentation. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00344"},{"key":"2441_CR74","doi-asserted-by":"crossref","unstructured":"Wu, D., Chen, P., Yu, X., Li, G., Han, Z., & Jiao, J. (2023). Spatial self-distillation for object detection with inaccurate bounding boxes. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00631"},{"key":"2441_CR75","doi-asserted-by":"crossref","unstructured":"Yan, G., Liu, B., Guo, N., Ye, X., Wan, F., You, H., & Fan, D. (2019). C-MIDN: Coupled multiple instance detection network with segmentation guidance for weakly supervised object detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00993"},{"key":"2441_CR76","doi-asserted-by":"crossref","unstructured":"Yang, Z., Liu, S., Hu, H., Wang, L., & Lin, S. (2019). Reppoints: Point set representation for object detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00975"},{"key":"2441_CR77","unstructured":"Yi, Y., Yang, X., Li, Q., Da, F., Yan, J., Dai, J., & Qiao, Y. (2024). Point2rbox: Combine knowledge from synthetic visual patterns for end-to-end oriented object detection with single point supervision. In CVPR."},{"key":"2441_CR78","doi-asserted-by":"crossref","unstructured":"Yu, X., Chen, P., Wu, D., Hassan, N., Li, G., Yan, J., Shi, H., Ye, Q., & Han, Z., (2022). Object localization under single coarse point supervision. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00482"},{"issue":"7","key":"2441_CR79","doi-asserted-by":"publisher","first-page":"4908","DOI":"10.1109\/TPAMI.2024.3361472","volume":"46","author":"X Yu","year":"2024","unstructured":"Yu, X., Chen, P., Wang, K., Han, X., Li, G., Han, Z., Ye, Q., & Jiao, J. (2024). CPR++: Object localization via single coarse point supervision. IEEE Transactions on Pattern Analysis and Machine Intelligence, 46(7), 4908\u20134925.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2441_CR80","doi-asserted-by":"crossref","unstructured":"Yu, X., Gong, Y., Jiang, N., Ye, Q., & Han, Z. (2020). Scale match for tiny person detection. In IEEE WACV.","DOI":"10.1109\/WACV45572.2020.9093394"},{"key":"2441_CR81","doi-asserted-by":"crossref","unstructured":"Zeng, Z., Liu, B., Fu, J., Chao, H., & Zhang, L. (2019). WSOD2: Learning bottom-up and top-down objectness distillation for weakly-supervised object detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00838"},{"key":"2441_CR82","doi-asserted-by":"crossref","unstructured":"Zhang, D., Han, J., Cheng, G., & Yang, M. (2021). Weakly supervised object localization and detection: A survey. In IEEE TPAMI.","DOI":"10.1109\/TPAMI.2021.3074313"},{"key":"2441_CR83","unstructured":"Zhang, H., Li, F., Liu, S., Zhang, L., Su, H., Zhu, J., Ni, L. M., & Shum, H. (2023a). DINO: DETR with improved denoising anchor boxes for end-to-end object detection. In ICLR"},{"key":"2441_CR84","doi-asserted-by":"crossref","unstructured":"Zhang, R., Tian, Z., Shen, C., You, M., & Yan, Y. (2020). Mask encoding for single shot instance segmentation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01024"},{"key":"2441_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, S., Yu, Z., Liu, L., Wang, X., Zhou, A., & Chen, K. (2022). Group R-CNN for weakly semi-supervised object detection with points. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00920"},{"key":"2441_CR86","unstructured":"Zhang, X., Liu, F., Peng, Z., Guo, Z., Wan, F., Ji, X., & Ye, Q. (2023b). Integral migrating pre-trained transformer encoder-decoders for visual object detection. In ICCV."},{"key":"2441_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, X., Wei, Y., Feng, J., Yang, Y., & Huang, T. S. (2018). Adversarial complementary learning for weakly supervised object localization. In CVPR.","DOI":"10.1109\/CVPR.2018.00144"},{"key":"2441_CR88","unstructured":"Zhao, J., Wang, G., Li, J., Jin, L., Fan, N., Wang, M., Wang, X., Yong, T., Deng, Y., Guo, Y., Ge, S., & Guo, G. (2021). The 2nd anti-UAV workshop & challenge: Methods and results. In ICCVW 2021."},{"key":"2441_CR89","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., & Torralba, A. (2016). Learning deep features for discriminative localization. In CVPR.","DOI":"10.1109\/CVPR.2016.319"},{"key":"2441_CR90","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Chen, H., Xu, J., Dou, Q., & Heng, P. (2019). Irnet: Instance relation network for overlapping cervical cell segmentation. In MICCAI.","DOI":"10.1007\/978-3-030-32239-7_71"},{"key":"2441_CR91","unstructured":"Zhu, H., Xu, C., & Zhang, R. E. (2024). Tiny object detection with single point supervision. arXiv preprint arxiv:2412.05837"},{"key":"2441_CR92","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2021). Deformable DETR: Deformable transformers for end-to-end object detection. In ICLR."},{"key":"2441_CR93","doi-asserted-by":"crossref","unstructured":"Zitnick, C. L., & Doll\u00e1r, P. (2014). Edge boxes: Locating object proposals from edges. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_26"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02441-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02441-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02441-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T13:01:19Z","timestamp":1757163679000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02441-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,3]]},"references-count":93,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2441"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02441-3","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,5,3]]},"assertion":[{"value":"23 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}