{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T01:22:25Z","timestamp":1774920145730,"version":"3.50.1"},"reference-count":135,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s11263-024-02167-8","type":"journal-article","created":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T06:32:36Z","timestamp":1724308356000},"page":"781-808","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Unsupervised Object Localization in the Era of Self-Supervised ViTs: A 
Survey"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3232-8978","authenticated-orcid":false,"given":"Oriane","family":"Sim\u00e9oni","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"\u00c9loi","family":"Zablocki","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Spyros","family":"Gidaris","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gilles","family":"Puy","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Patrick","family":"P\u00e9rez","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,20]]},"reference":[{"key":"2167_CR1","doi-asserted-by":"crossref","unstructured":"Aflalo, A., Bagon, S., Kashti, T., & Eldar, Y.\u00a0C. (2022). Deepcut: Unsupervised segmentation using graph neural networks clustering. CoRR. arXiv:2212.05853","DOI":"10.1109\/ICCVW60793.2023.00010"},{"key":"2167_CR2","unstructured":"Amir, S., Gandelsman, Y., Bagon, S., & Dekel, T. (2021). Deep vit features as dense visual descriptors. ECCVW What is Motion For?."},{"key":"2167_CR3","doi-asserted-by":"crossref","unstructured":"Amjoud, A.B., & Amrouch, M. (2023). Object detection using deep learning, cnns and vision transformers: A review. IEEE Access.","DOI":"10.1109\/ACCESS.2023.3266093"},{"key":"2167_CR4","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., & Zisserman, A. (2018). Objects that sound. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"2167_CR5","unstructured":"Arandjelovic, R., & Zisserman, A. (2019). Object discovery with a copy-pasting GAN. CoRR. 
arXiv:1905.11369."},{"key":"2167_CR6","doi-asserted-by":"crossref","unstructured":"Assran, M., Caron, M., Misra, I., Bojanowski, P., Bordes, F., Vincent, P., Joulin, A., Rabbat, M., & Ballas, N. (2022). Masked siamese networks for label-efficient learning. In ECCV.","DOI":"10.1007\/978-3-031-19821-2_26"},{"key":"2167_CR7","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., & Torralba, A. (2016). Soundnet: Learning sound representations from unlabeled video. In NeurIPS.","DOI":"10.1109\/CVPR.2016.18"},{"key":"2167_CR8","doi-asserted-by":"crossref","unstructured":"Bafghi, R.A., & Gurari, D. (2023). A new dataset based on images taken by blind people for testing the robustness of image classification models trained for imagenet categories. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01560"},{"key":"2167_CR9","doi-asserted-by":"crossref","unstructured":"Bao, Z., Tokmakov, P., Wang, Y., Gaidon, A., & Hebert, M. (2023). Object discovery from motion-guided tokens. In CVPR.","DOI":"10.1109\/CVPR52729.2023.02200"},{"key":"2167_CR10","doi-asserted-by":"crossref","unstructured":"Barron, J.\u00a0T., & Poole, B. (2016). The fast bilateral solver. In ECCV.","DOI":"10.1007\/978-3-319-46487-9_38"},{"key":"2167_CR11","unstructured":"Bielski, A., & Favaro, P. (2019). Emergence of object segmentation in perturbed generative models. In NeurIPS."},{"key":"2167_CR12","unstructured":"Bielski, A., & Favaro, P. (2022). MOVE: unsupervised movable object segmentation and detection. In NeurIPS."},{"key":"2167_CR13","doi-asserted-by":"crossref","unstructured":"Cai, Z., & Vasconcelos, N. (2018). Cascade r-cnn: Delving into high quality object detection. In CVPR.","DOI":"10.1109\/CVPR.2018.00644"},{"key":"2167_CR14","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. 
In ECCV.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2167_CR15","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., & Joulin, A. (2020). Unsupervised learning of visual features by contrasting cluster assignments. In NeurIPS."},{"key":"2167_CR16","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., & Joulin, A. (2021). Emerging properties in self-supervised vision transformers. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"2167_CR17","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., & Zisserman, A. (2021a) Localizing visual sounds the hard way. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"2167_CR18","doi-asserted-by":"crossref","unstructured":"Chen, L., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A.\u00a0L. (2018). Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, & fully connected crfs. IEEE TPAMI.","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"2167_CR19","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G.\u00a0E. (2020a). A simple framework for contrastive learning of visual representations. In ICML."},{"key":"2167_CR20","doi-asserted-by":"crossref","unstructured":"Chen, X., & He, K. (2021). Exploring simple siamese representation learning. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"2167_CR21","unstructured":"Chen, X., Fan, H., Girshick, R.\u00a0B., & He, K. (2020b). Improved baselines with momentum contrastive learning. CoRR. arXiv:2003.04297."},{"key":"2167_CR22","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., & He, K. (2021b). An empirical study of training self-supervised vision transformers. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"2167_CR23","doi-asserted-by":"crossref","unstructured":"Chen, Y., Li, W., Chen, X., & Gool, L.\u00a0V. (2019). 
Learning semantic segmentation from synthetic data: A geometrically guided input-output adaptation approach. In CVPR.","DOI":"10.1109\/CVPR.2019.00194"},{"key":"2167_CR24","unstructured":"Cheng, B., Schwing, A.\u00a0G., & Kirillov, A. (2021). Per-pixel classification is not all you need for semantic segmentation. In NeurIPS."},{"key":"2167_CR25","unstructured":"Cho, J.\u00a0H., Mall, U., Bala, K., & Hariharan, B. (2021). PiCIE: Unsupervised semantic segmentation using invariance and equivariance in clustering. In CVPR."},{"key":"2167_CR26","doi-asserted-by":"crossref","unstructured":"Choudhuri, S., Das, N., Sarkhel, R., & Nasipuri, M. (2018). Object localization on natural scenes: A survey. PR.","DOI":"10.1142\/S0218001418550017"},{"key":"2167_CR27","unstructured":"Choudhury, S., Karazija, L., Laina, I., Vedaldi, A., & Rupprecht, C. (2022). Guess what moves: Unsupervised video and image segmentation by anticipating motion. In BMVC."},{"key":"2167_CR28","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In CVPR.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2167_CR29","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In ICLR."},{"key":"2167_CR30","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., & Ommer, B. (2021). Taming transformers for high-resolution image synthesis. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2167_CR31","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.\u00a0K.\u00a0I., Winn, J., & Zisserman, A. (2007). 
The PASCAL Visual Object Classes Challenge 2007 (VOC2007) Results a."},{"key":"2167_CR32","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.\u00a0K.\u00a0I., Winn, J., & Zisserman, A. (2012). The PASCAL Visual Object Classes Challenge 2012 (VOC2012) Results, b."},{"key":"2167_CR33","unstructured":"Gansbeke, W.\u00a0V., Vandenhende, S., & Gool, L.\u00a0V. (2022). Discovering object masks with transformers for unsupervised semantic segmentation. CoRR. arXiv:2206.06363."},{"key":"2167_CR34","doi-asserted-by":"crossref","unstructured":"Gomel, E., Shaharbany, T., & Wolf, L. (2023). Box-based refinement for weakly supervised and unsupervised localization tasks. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01470"},{"key":"2167_CR35","unstructured":"Grill, J.-B., Strub, F., Altch\u00e9, F., Tallec, C., Richemond, P.\u00a0H., Buchatskaya, E., Doersch, C., Pires, B.\u00a0A., Guo, Z.\u00a0D., Azar, M.\u00a0G. et\u00a0al. (2020). Bootstrap your own latent: A new approach to self-supervised learning. In NeurIPS."},{"key":"2167_CR36","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., & Girshick, R. (2019). LVIS: A dataset for large vocabulary instance segmentation. In CVPR.","DOI":"10.1109\/CVPR.2019.00550"},{"key":"2167_CR37","unstructured":"Hamilton, M., Zhang, Z., Hariharan, B., Snavely, N., & Freeman, W.\u00a0T. (2022). Unsupervised semantic segmentation by distilling feature correspondences. In ICLR."},{"key":"2167_CR38","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep Residual Learning for Image Recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2167_CR39","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In CVPR.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2167_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R.\u00a0B. (2022). 
Masked autoencoders are scalable vision learners. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2167_CR41","doi-asserted-by":"crossref","unstructured":"H\u00e9naff, O.\u00a0J., Koppula, S., Shelhamer, E., Zoran, D., Jaegle, A., Zisserman, A., Carreira, J., & Arandjelovic, R. (2022). Object discovery and representation networks. In ECCV.","DOI":"10.1007\/978-3-031-19812-0_8"},{"key":"2167_CR42","doi-asserted-by":"crossref","unstructured":"Hoyer, L., Dai, D., Chen, Y., K\u00f6ring, A., Saha, S., & Gool, L.\u00a0V. (2021). Three ways to improve semantic segmentation with self-supervised depth estimation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01098"},{"key":"2167_CR43","doi-asserted-by":"crossref","unstructured":"Hoyer, L., Dai, D., Wang, Q., Chen, Y., & Gool, L.\u00a0V. (2023). Improving semi-supervised and domain-adaptive semantic segmentation with self-supervised depth estimation. IJCV.","DOI":"10.1007\/s11263-023-01799-6"},{"key":"2167_CR44","doi-asserted-by":"crossref","unstructured":"Ishtiak, T., En, Q., & Guo, Y. (2023). Exemplar-freesolo: Enhancing unsupervised instance segmentation with exemplars. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01480"},{"key":"2167_CR45","doi-asserted-by":"crossref","unstructured":"Ji, X., Henriques, J.\u00a0F., & Vedaldi, A. (2019). Invariant information clustering for unsupervised image classification and segmentation. In ICCV.","DOI":"10.1109\/ICCV.2019.00996"},{"key":"2167_CR46","unstructured":"Jiang, J., Deng, F., Singh, G., & Ahn, S. (2023). Object-centric slot diffusion. arXiv preprint arXiv:2303.10834."},{"key":"2167_CR47","doi-asserted-by":"crossref","unstructured":"Kara, S., Ammar, H., Chabot, F., & Pham, Q.\u00a0C. (2023). Image segmentation-based unsupervised multiple objects discovery. In WACV.","DOI":"10.1109\/WACV56688.2023.00329"},{"key":"2167_CR48","unstructured":"Karazija, L., Choudhury, S., Laina, I., Rupprecht, C., & Vedaldi, A. (2022). 
Unsupervised multi-object segmentation by predicting probable motion patterns. In NeurIPS."},{"issue":"12","key":"2167_CR49","doi-asserted-by":"publisher","first-page":"9574","DOI":"10.1109\/TPAMI.2021.3123902","volume":"44","author":"I Katircioglu","year":"2021","unstructured":"Katircioglu, I., Rhodin, H., Constantin, V., Sp\u00f6rri, J., Salzmann, M., & Fua, P. (2021). Self-supervised human detection and segmentation via background inpainting. IEEE TPAMI, 44(12), 9574\u20139588.","journal-title":"IEEE TPAMI"},{"key":"2167_CR50","doi-asserted-by":"crossref","unstructured":"Kidron, E., Schechner, Y.\u00a0Y., & Elad, M. (2005). Pixels that sound. In CVPR.","DOI":"10.1109\/CVPR.2005.274"},{"key":"2167_CR51","unstructured":"Kim, G., & Torralba, A. (2009). Unsupervised detection of regions of interest using iterative link analysis. In NeurIPS."},{"key":"2167_CR52","doi-asserted-by":"crossref","unstructured":"Kim, W., Kanezaki, A., & Tanaka, M. (2020). Unsupervised learning of image segmentation based on differentiable feature clustering. NeurIPS.","DOI":"10.1109\/TIP.2020.3011269"},{"key":"2167_CR53","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.\u00a0C., Lo, W.-Y., et\u00a0al. (2023). Segment anything. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2167_CR54","unstructured":"Kr\u00e4henb\u00fchl, P., & Koltun, V. (2011). Efficient inference in fully connected crfs with gaussian edge potentials. In NeurIPS."},{"key":"2167_CR55","doi-asserted-by":"crossref","unstructured":"Kuhn, H.\u00a0W. (1955). The hungarian method for the assignment problem. Naval research logistics quarterly.","DOI":"10.1002\/nav.3800020109"},{"key":"2167_CR56","unstructured":"Lao, D., Hu, Z., Locatello, F., Yang, Y., & Soatto, S. (2023). Divided attention: Unsupervised multi-object discovery with contextually separated slots. CoRR. 
arXiv:2304.01430."},{"key":"2167_CR57","unstructured":"Li, C., Yang, J., Zhang, P., Gao, M., Xiao, B., Dai, X., Yuan, L., & Gao, J. (2022a). Efficient self-supervised vision transformers for representation learning. In ICLR."},{"key":"2167_CR58","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022b). Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML."},{"key":"2167_CR59","doi-asserted-by":"crossref","unstructured":"Li, N., Sun, B., & Yu, J. (2015). A weighted sparse coding framework for saliency detection. In CVPR.","DOI":"10.1109\/CVPR.2015.7299158"},{"key":"2167_CR60","unstructured":"Li, X., Lin, C., Chen, Y., Liu, Z., Wang, J., & Raj, B. (2023). Paintseg: Training-free segmentation via painting. In NeurIPS."},{"key":"2167_CR61","unstructured":"Lim, S., Park, J., Lee, M., & Lee, H. (2022). K-means for unsupervised instance segmentation using a self-supervised transformer. Available at SSRN 4251338."},{"key":"2167_CR62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48","author":"T Lin","year":"2014","unstructured":"Lin, T., Maire, M., Belongie, S. J., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft COCO: Common objects in context. In ECCV. https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","journal-title":"In ECCV"},{"key":"2167_CR63","unstructured":"Locatello, F., Weissenborn, D., Unterthiner, T., Mahendran, A., Heigold, G., Uszkoreit, J., Dosovitskiy, A., & Kipf, T. (2020). Object-centric learning with slot attention. In NeurIPS."},{"key":"2167_CR64","doi-asserted-by":"crossref","unstructured":"Lv, Y., Zhang, J., Barnes, N., & Dai, Y. (2023). Weakly-supervised contrastive learning for unsupervised object discovery. CoRR. arXiv:2307.03376.","DOI":"10.1109\/TIP.2024.3380243"},{"key":"2167_CR65","unstructured":"Ma, C., Yang, Y., Ju, C., Zhang, F., Liu, J., Wang, Y., Zhang, Y., & Wang, Y. (2023). 
Diffusionseg: Adapting diffusion towards unsupervised object discovery. arXiv preprint arXiv:2303.09813."},{"key":"2167_CR66","unstructured":"Melas-Kyriazi, L., Rupprecht, C., Laina, I., & Vedaldi, A. (2021). Finding an unsupervised image segmenter in each of your deep generative models. CoRR. arXiv:2105.08127."},{"key":"2167_CR67","doi-asserted-by":"crossref","unstructured":"Melas-Kyriazi, L., Rupprecht, C., Laina, I., & Vedaldi, A. (2022a) Deep spectral methods: A surprisingly strong baseline for unsupervised semantic segmentation and localization. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00818"},{"key":"2167_CR68","unstructured":"Melas-Kyriazi, L., Rupprecht, C., Laina, I., & Vedaldi, A. (2022b). Finding an unsupervised image segmenter in each of your deep generative models. In ICLR."},{"key":"2167_CR69","unstructured":"Nguyen, D.\u00a0T., Dax, M., Mummadi, C.\u00a0K., Ngo, T., Nguyen, T.\u00a0H.\u00a0P., Lou, Z., & Brox, T. (2019). Deepusps: Deep robust unsupervised saliency prediction via self-supervision. In NeurIPS."},{"key":"2167_CR70","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A., Assran, M., Ballas, N., Galuba, W., Howes, R., Huang, P., Li, S., Misra, I., Rabbat, M.\u00a0G., Sharma, V., Synnaeve, G., Xu, H., J\u00e9gou, H., Mairal, J., Labatut, P., Joulin, A., & Bojanowski, P. (2023). Dinov2: Learning robust visual features without supervision. CoRR. arXiv:2304.07193."},{"key":"2167_CR71","unstructured":"Ostyakov, P., Suvorov, R., Logacheva, E., Khomenko, O., & Nikolenko, S.\u00a0I. (2018). SEIGAN: towards compositional image generation by simultaneously learning to segment, enhance, & inpaint. CoRR. arXiv:1811.07630."},{"key":"2167_CR72","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.\u00a0H., & Freeman, W.\u00a0T. (2016). Visually indicated sounds. 
In CVPR.","DOI":"10.1109\/CVPR.2016.264"},{"key":"2167_CR73","unstructured":"Radford, A., Kim, J.\u00a0W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J. et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748\u20138763. PMLR."},{"key":"2167_CR74","doi-asserted-by":"crossref","unstructured":"Rambhatla, S.\u00a0S., Misra, I., Chellappa, R., & Shrivastava, A. (2023). MOST: multiple object localization with self-supervised transformers for object discovery. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01450"},{"key":"2167_CR75","doi-asserted-by":"crossref","unstructured":"Ravindran, S., & Basu, D. (2023). SEMPART: self-supervised multi-resolution partitioning of image semantics. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00073"},{"key":"2167_CR76","doi-asserted-by":"crossref","unstructured":"Remez, T., Huang, J., & Brown, M. (2018). Learning to segment via cut-and-paste. In ECCV.","DOI":"10.1007\/978-3-030-01234-2_3"},{"key":"2167_CR77","unstructured":"Ren, S., He, K., Girshick, R.\u00a0B., & Sun, J. (2015). Faster R-CNN: towards real-time object detection with region proposal networks. In NeurIPS."},{"key":"2167_CR78","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2167_CR79","doi-asserted-by":"crossref","unstructured":"Safadoust, S., & G\u00fcney, F. (2023). Multi-object discovery by low-dimensional object motion. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00074"},{"key":"2167_CR80","doi-asserted-by":"crossref","unstructured":"Schmarje, L., Santarossa, M., Schr\u00f6der, S., & Koch, R. (2021). A survey on semi-, self- and unsupervised learning for image classification. 
IEEE Access.","DOI":"10.1109\/ACCESS.2021.3084358"},{"key":"2167_CR81","unstructured":"Seitzer, M., Horn, M., Zadaianchuk, A., Zietlow, D., Xiao, T., Simon-Gabriel, C., He, T., Zhang, Z., Sch\u00f6lkopf, B., Brox, T., & Locatello, F. (2023). Bridging the gap to real-world object-centric learning. In ICLR."},{"key":"2167_CR82","doi-asserted-by":"crossref","unstructured":"Shao, F., Chen, L., Shao, J., Ji, W., Xiao, S., Ye, L., Zhuang, Y., & Xiao, J. (2022). Deep learning for weakly-supervised object detection and localization: A survey. Neurocomputing.","DOI":"10.1016\/j.neucom.2022.01.095"},{"key":"2167_CR83","doi-asserted-by":"crossref","unstructured":"Sharma, R., Saqib, M., Lin, C., & Blumenstein, M. (2022). A survey on object instance segmentation. SN Computer Science.","DOI":"10.1007\/s42979-022-01407-3"},{"key":"2167_CR84","unstructured":"Shehzadi, T., Hashmi, K.\u00a0A., Stricker, D., & Afzal, M.\u00a0Z. (2023). Object detection with transformers: A review. CoRR. arXiv:2306.04670."},{"key":"2167_CR85","doi-asserted-by":"crossref","unstructured":"Shi, J., Yan, Q., Xu, L., & Jia, J. (2016). Hierarchical image saliency detection on extended CSSD. IEEE TPAMI.","DOI":"10.1109\/TPAMI.2015.2465960"},{"key":"2167_CR86","doi-asserted-by":"crossref","unstructured":"Shin, G., Albanie, S., & Xie, W. (2022). Unsupervised salient object detection with spectral cluster voting. In CVPRW.","DOI":"10.1109\/CVPRW56347.2022.00442"},{"key":"2167_CR87","doi-asserted-by":"crossref","unstructured":"Shin, G., Xie, W., & Albanie, S. (2023). Namedmask: Distilling segmenters from complementary foundation models. In CVPRW.","DOI":"10.1109\/CVPRW59228.2023.00524"},{"key":"2167_CR88","unstructured":"Sim\u00e9oni, O., Puy, G., Vo, H.\u00a0V., Roburin, S., Gidaris, S., Bursuc, A., P\u00e9rez, P., Marlet, R., & Ponce, J. (2021). Localizing objects with self-supervised transformers and no labels. 
In BMVC."},{"key":"2167_CR89","doi-asserted-by":"crossref","unstructured":"Sim\u00e9oni, O., Sekkat, C., Puy, G., Vobecky, A., Zablocki, E., & P\u00e9rez, P. (2023). Unsupervised object localization: Observing the background to discover objects. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00310"},{"key":"2167_CR90","doi-asserted-by":"crossref","unstructured":"Song, Y., Jang, S., Katabi, D., & Son, J. (2023). Unsupervised object localization with representer point selection. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00601"},{"key":"2167_CR91","doi-asserted-by":"crossref","unstructured":"Tian, H., Chen, Y., Dai, J., Zhang, Z., & Zhu, X. (2021). Unsupervised object detection with lidar clues. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00590"},{"key":"2167_CR92","unstructured":"Triantafyllos, A., Yuki, M.\u00a0A., Fagan, F., Vedaldi, A., & Metze, F. (2020). Self-supervised object detection from audio-visual correspondence. In ECCV."},{"key":"2167_CR93","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JRR Uijlings","year":"2013","unstructured":"Uijlings, J. R. R., van de Sande, K. E. A., & Gevers, T. (2013). Selective search for object recognition. International Journal of Computer Vision, 104, 154\u2013171.","journal-title":"International Journal of Computer Vision"},{"key":"2167_CR94","unstructured":"van den Oord, A., & Vinyals, O. (2017). Neural discrete representation learning. Advances in neural information processing systems, 30"},{"key":"2167_CR95","doi-asserted-by":"crossref","unstructured":"Vandenhende, S., Georgoulis, S., Gansbeke, W.\u00a0V., Proesmans, M., Dai, D., & Gool, L.\u00a0V. (2022). Multi-task learning for dense prediction tasks: A survey. IEEE TPAMI.","DOI":"10.1109\/TPAMI.2021.3054719"},{"key":"2167_CR96","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.\u00a0N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. 
In NeurIPS."},{"key":"2167_CR97","doi-asserted-by":"crossref","unstructured":"Vo, H.\u00a0V., Bach, F.\u00a0R., Cho, M., Han, K., LeCun, Y., P\u00e9rez, P., & Ponce, J. (2019). Unsupervised image matching and object discovery as optimization. In CVPR.","DOI":"10.1109\/CVPR.2019.00848"},{"key":"2167_CR98","doi-asserted-by":"crossref","unstructured":"Vo, H.\u00a0V., P\u00e9rez, P., & Ponce, J. (2020a). Toward unsupervised, multi-object discovery in large-scale image collections. In ECCV.","DOI":"10.1007\/978-3-030-58592-1_46"},{"key":"2167_CR99","doi-asserted-by":"crossref","unstructured":"Vo, H.\u00a0V., P\u00e9rez, P., & Ponce, J. (2020b). Toward unsupervised, multi-object discovery in large-scale image collections. In ECCV.","DOI":"10.1007\/978-3-030-58592-1_46"},{"key":"2167_CR100","unstructured":"Vo, H.\u00a0V., Sizikova, E., Schmid, C., P\u00e9rez, P., & Ponce, J. (2021a). Large-scale unsupervised object discovery. In NeurIPS."},{"key":"2167_CR101","unstructured":"Vo, V.\u00a0H., Sizikova, E., Schmid, C., P\u00e9rez, P., & Ponce, J. (2021b). Large-scale unsupervised object discovery. In NeurIPS."},{"key":"2167_CR102","doi-asserted-by":"crossref","unstructured":"Vobecky, A., Hurych, D., Sim\u00e9oni, O., Gidaris, S., Bursuc, A., P\u00e9rez, P., & Sivic, J. (2022). Drive &segment: Unsupervised semantic segmentation of urban scenes via cross-modal distillation. In ECCV.","DOI":"10.1007\/978-3-031-19839-7_28"},{"key":"2167_CR103","unstructured":"Voynov, A., Morozov, S., & Babenko, A. (2021). Object segmentation without labels with large-scale generative models. In ICML."},{"key":"2167_CR104","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., & Belongie, S. (2011). The caltech-ucsd birds-200-2011 dataset."},{"key":"2167_CR105","unstructured":"Wang, J., Li, X., Zhang, J., Xu, Q., Zhou, Q., Yu, Q., Sheng, L., & Xu, D. (2023a). Diffusion model is secretly a training-free open vocabulary semantic segmenter. 
arXiv preprint arXiv:2309.02773."},{"key":"2167_CR106","doi-asserted-by":"crossref","unstructured":"Wang, L., Lu, H., Wang, Y., Feng, M., Wang, D., Yin, B., & Ruan, X. (2017). Learning to detect salient objects with image-level supervision. In CVPR.","DOI":"10.1109\/CVPR.2017.404"},{"key":"2167_CR107","doi-asserted-by":"crossref","unstructured":"Wang, W., Feiszli, M., Wang, H., & Tran, D. (2021a). Unidentified video objects: A benchmark for dense, open-world segmentation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.01060"},{"key":"2167_CR108","doi-asserted-by":"crossref","unstructured":"Wang, X., Kong, T., Shen, C., Jiang, Y., & Li, L. (2020a). Solo: Segmenting objects by locations. In ECCV.","DOI":"10.1007\/978-3-030-58523-5_38"},{"key":"2167_CR109","unstructured":"Wang, X., Zhang, R., Kong, T., Li, L., & Shen, C. (2020b). Solov2: Dynamic and fast instance segmentation. In NeurIPS."},{"key":"2167_CR110","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, R., Shen, C., Kong, T., & Li, L. (2021b). Dense contrastive learning for self-supervised visual pre-training. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00304"},{"key":"2167_CR111","doi-asserted-by":"crossref","unstructured":"Wang, X., Yu, Z., Mello, S.\u00a0D., Kautz, J., Anandkumar, A., Shen, C., & Alvarez, J.\u00a0M. (2022a). Freesolo: Learning to segment objects without annotations. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01378"},{"key":"2167_CR112","doi-asserted-by":"crossref","unstructured":"Wang, X., Girdhar, R., Yu, S.\u00a0X., & Misra, I. (2023b) Cut and learn for unsupervised object detection and instance segmentation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00305"},{"key":"2167_CR113","doi-asserted-by":"crossref","unstructured":"Wang, X., Misra, I., Zeng, Z., Girdhar, R., & Darrell, T. (2023c) Videocutler: Surprisingly simple unsupervised video instance segmentation. CoRR. 
arXiv:2308.14710.","DOI":"10.1109\/CVPR52733.2024.02147"},{"issue":"2\u20133","key":"2167_CR114","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1561\/0600000097","volume":"13","author":"Y Wang","year":"2022","unstructured":"Wang, Y., Ahsan, U., Li, H., & Hagen, M. (2022). A comprehensive review of modern object segmentation approaches. Foundations and Trends in Computer Graphics and Vision, 13(2\u20133), 111\u2013283.","journal-title":"Foundations and Trends in Computer Graphics and Vision"},{"key":"2167_CR115","doi-asserted-by":"crossref","unstructured":"Wang, Y., Shen, X., Hu, S.\u00a0X., Yuan, Y., Crowley, J.\u00a0L., & Vaufreydaz, D. (2022c). Self-supervised transformers for unsupervised object discovery using normalized cut. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01414"},{"key":"2167_CR116","doi-asserted-by":"crossref","unstructured":"Wang, Y., He, X., Peng, S., Lin, H., Bao, H., & Zhou, X. (2023d). Autorecon: Automated 3d object discovery and reconstruction. In CVPR.","DOI":"10.1109\/CVPR52729.2023.02048"},{"key":"2167_CR117","doi-asserted-by":"crossref","unstructured":"Wei, X.-S., Zhang, C.-L., Wu, J., Shen, C., & Zhou, Z.-H. (2019). Unsupervised object discovery and co-localization by deep descriptor transforming. PR.","DOI":"10.1016\/j.patcog.2018.10.022"},{"key":"2167_CR118","unstructured":"Wen, X., Zhao, B., Zheng, A., Zhang, X., & Qi, X. (2022). Self-supervised visual representation learning with semantic grouping. In NeurIPS."},{"key":"2167_CR119","unstructured":"Wu, Z., Hu, J., Lu, W., Gilitschenski, I., & Garg, A. (2024). Slotdiffusion: Object-centric generative modeling with diffusion models. NeurIPS."},{"key":"2167_CR120","doi-asserted-by":"crossref","unstructured":"Wysoczanska, M., Ramamonjisoa, M., Trzcinski, T., & Sim\u00e9oni, O. (2023). 
Clip-diy: Clip dense inference yields open-vocabulary semantic segmentation for-free.","DOI":"10.1109\/WACV57701.2024.00143"},{"key":"2167_CR121","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, S., Mello, S.\u00a0D., Yu, Z., Kautz, J., & Yang, M. Learning contrastive representation for semantic correspondence. IJCV, (2022).","DOI":"10.1007\/s11263-022-01602-y"},{"key":"2167_CR122","unstructured":"Xie, J., Zhan, X., Liu, Z., Ong, Y.\u00a0S., & Loy, C.\u00a0C. (2021). Unsupervised object-level representation learning from scene images. In NeurIPS."},{"key":"2167_CR123","doi-asserted-by":"crossref","unstructured":"Yan, Q., Xu, L., Shi, J., & Jia, J. (2013). Hierarchical saliency detection. In CVPR.","DOI":"10.1109\/CVPR.2013.153"},{"key":"2167_CR124","doi-asserted-by":"crossref","unstructured":"Yang, C., Zhang, L., Lu, H., Ruan, X., & Yang, M. (2013). Saliency detection via graph-based manifold ranking. In CVPR.","DOI":"10.1109\/CVPR.2013.407"},{"key":"2167_CR125","doi-asserted-by":"crossref","unstructured":"Yang, Y., Loquercio, A., Scaramuzza, D., & Soatto, S. (2019). Unsupervised moving object detection via contextual information separation. In CVPR.","DOI":"10.1109\/CVPR.2019.00097"},{"key":"2167_CR126","unstructured":"Yeh, C.-K., Kim, J., Yen, I.\u00a0E.-H., & Ravikumar, P.\u00a0K. (2018). Representer point selection for explaining deep neural networks. NeurIPS."},{"key":"2167_CR127","doi-asserted-by":"crossref","unstructured":"Zhang, D., Han, J., Cheng, G., & Yang, M. (2022). Weakly supervised object localization and detection: A survey. IEEE TPAMI.","DOI":"10.1109\/TPAMI.2021.3074313"},{"key":"2167_CR128","doi-asserted-by":"crossref","unstructured":"Zhang, R., Huang, Y., Pu, M., Zhang, J., Guan, Q., Zou, Q., & Ling, H. (2020). Object discovery from a single unlabeled image by mining frequent itemsets with multi-scale features. 
IEEE TIP.","DOI":"10.1109\/TIP.2020.3015543"},{"key":"2167_CR129","doi-asserted-by":"crossref","unstructured":"Zhang, X., & Boularias, A. (2023). Optical flow boosts unsupervised localization and segmentation. In IROS, 2023.","DOI":"10.1109\/IROS55552.2023.10342195"},{"key":"2167_CR130","doi-asserted-by":"crossref","unstructured":"Zhang, Y., & Wu, C. (2023). Unsupervised camouflaged object segmentation as domain adaptation. CoRR. arXiv:2308.04528.","DOI":"10.31219\/osf.io\/3rtgu"},{"key":"2167_CR131","doi-asserted-by":"crossref","unstructured":"Zheng, M., Wang, F., You, S., Qian, C., Zhang, C., Wang, X., & Xu, C. (2021). Weakly supervised contrastive learning. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00989"},{"key":"2167_CR132","unstructured":"Zhou, J., Wei, C., Wang, H., Shen, W., Xie, C., Yuille, A.\u00a0L., & Kong, T. (2022). Image BERT pre-training with online tokenizer. In ICLR."},{"key":"2167_CR133","doi-asserted-by":"crossref","unstructured":"Zhu, W., Liang, S., Wei, Y., & Sun, J. (2014). Saliency optimization from robust background detection. In CVPR.","DOI":"10.1109\/CVPR.2014.360"},{"key":"2167_CR134","doi-asserted-by":"crossref","unstructured":"Ziegler, A., & Asano, Y.\u00a0M. (2022). Self-supervised learning of object parts for semantic segmentation. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01410"},{"key":"2167_CR135","doi-asserted-by":"crossref","unstructured":"Zitnick, L., & Doll\u00e1r, P. (2014). Edge boxes: Locating object proposals from edges. 
In ECCV.","DOI":"10.1007\/978-3-319-10602-1_26"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02167-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02167-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02167-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T06:43:06Z","timestamp":1737528186000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02167-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,20]]},"references-count":135,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["2167"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02167-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,20]]},"assertion":[{"value":"16 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 June 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"This survey was funded by Valeo and no other funding was received to assist with the preparation of this manuscript. 
The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interests"}}]}}