{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T16:59:28Z","timestamp":1780765168530,"version":"3.54.1"},"publisher-location":"Cham","reference-count":87,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730269","type":"print"},{"value":"9783031730276","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73027-6_15","type":"book-chapter","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T17:56:00Z","timestamp":1732557360000},"page":"253-272","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Do Text-Free Diffusion Models Learn Discriminative Visual Representations?"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-4134-1805","authenticated-orcid":false,"given":"Soumik","family":"Mukhopadhyay","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9826-6285","authenticated-orcid":false,"given":"Matthew","family":"Gwilliam","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0816-2931","authenticated-orcid":false,"given":"Yosuke","family":"Yamaguchi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9470-198X","authenticated-orcid":false,"given":"Vatsal","family":"Agarwal","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0257-8831","authenticated-orcid":false,"given":"Namitha","family":"Padmanabhan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1412-9790","authenticated-orcid":false,"given":"Archana","family":"Swaminathan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5348-0632","authenticated-orcid":false,"given":"Tianyi","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7148-4127","authenticated-orcid":false,"given":"Jun","family":"Ohya","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8928-8554","authenticated-orcid":false,"given":"Abhinav","family":"Shrivastava","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,26]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Assran, M., et al.: Masked Siamese networks for label-efficient learning (2022)","DOI":"10.1007\/978-3-031-19821-2_26"},{"key":"15_CR2","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: BEiT: BERT pre-training of image transformers (2022)"},{"key":"15_CR3","unstructured":"Baranchuk, D., Voynov, A., Rubachev, I., Khrulkov, V., Babenko, A.: Label-efficient semantic segmentation with diffusion models. In: International Conference on Learning Representations (2021)"},{"key":"15_CR4","unstructured":"Bardes, A., Ponce, J., LeCun, Y.: VICReg: variance-invariance-covariance regularization for self-supervised learning. arXiv: abs\/2105.04906 (2021)"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Besnier, V., Jain, H., Bursuc, A., Cord, M., P\u00e9rez, P.: This dataset does not exist: training models from generated images. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053146"},{"key":"15_CR6","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096 (2018)"},{"key":"15_CR7","unstructured":"Burgert, R., Ranasinghe, K., Li, X., Ryoo, M.S.: Peekaboo: text to image diffusion models are zero-shot segmentors. arXiv preprint arXiv:2211.13224 (2022)"},{"key":"15_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01264-9_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Caron","year":"2018","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 139\u2013156. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9"},{"key":"15_CR9","first-page":"9912","volume":"33","author":"M Caron","year":"2020","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. Adv. Neural. Inf. Process. Syst. 33, 9912\u20139924 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., Luo, P.: DiffusionDet: diffusion model for object detection. arXiv preprint arXiv:2211.09788 (2022)","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"15_CR12","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"15_CR13","first-page":"22243","volume":"33","author":"T Chen","year":"2020","unstructured":"Chen, T., Kornblith, S., Swersky, K., Norouzi, M., Hinton, G.E.: Big self-supervised models are strong semi-supervised learners. Adv. Neural. Inf. Process. Syst. 33, 22243\u201322255 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR14","unstructured":"Chen, X., Duan, Y., Houthooft, R., Schulman, J., Sutskever, I., Abbeel, P.: InfoGAN: interpretable representation learning by information maximizing generative adversarial nets. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"15_CR15","unstructured":"Chen, X., Fan, H., Girshick, R.B., He, K.: Improved baselines with momentum contrastive learning. CoRR abs\/2003.04297 (2020). https:\/\/arxiv.org\/abs\/2003.04297"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Chen, X., He, K.: Exploring simple Siamese representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15750\u201315758 (2021)","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Chen*, X., Xie*, S., He, K.: An empirical study of training self-supervised vision transformers. arXiv preprint arXiv:2104.02057 (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"15_CR19","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis (2021)"},{"key":"15_CR20","unstructured":"Donahue, J., Kr\u00e4henb\u00fchl, P., Darrell, T.: Adversarial feature learning. arXiv preprint arXiv:1605.09782 (2016)"},{"key":"15_CR21","unstructured":"Donahue, J., Simonyan, K.: Large scale adversarial representation learning. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"15_CR22","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16\u00a0$$\\times $$\u00a016 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"15_CR23","unstructured":"Dumoulin, V., et al.: Adversarially learned inference. arXiv preprint arXiv:1606.00704 (2016)"},{"issue":"11","key":"15_CR24","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"15_CR25","unstructured":"Grill, J., et al.: Bootstrap your own latent: a new approach to self-supervised learning. CoRR abs\/2006.07733 (2020). https:\/\/arxiv.org\/abs\/2006.07733"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Gupta, K., Singh, S., Shrivastava, A.: PatchVAE: learning local latent codes for recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2020","DOI":"10.1109\/CVPR42600.2020.00480"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Gwilliam, M., Shrivastava, A.: Beyond supervised vs. unsupervised: representative benchmarking and analysis of image representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9642\u20139652, June 2022","DOI":"10.1109\/CVPR52688.2022.00942"},{"key":"15_CR28","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.B.: Masked autoencoders are scalable vision learners. CoRR abs\/2111.06377 (2021). https:\/\/arxiv.org\/abs\/2111.06377"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN (2018)","DOI":"10.1109\/ICCV.2017.322"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition (2015)","DOI":"10.1109\/CVPR.2016.90"},{"key":"15_CR31","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR32","unstructured":"Huang, Z., et al.: Contrastive masked autoencoders are stronger vision learners (2022)"},{"key":"15_CR33","unstructured":"Jahanian, A., Puig, X., Tian, Y., Isola, P.: Generative models as a data source for multiview representation learning. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=qhAeZjs7dCL"},{"key":"15_CR34","unstructured":"Karras, T., Aila, T., Laine, S., Lehtinen, J.: Progressive growing of GANs for improved quality, stability, and variation. arXiv preprint arXiv:1710.10196 (2017)"},{"key":"15_CR35","first-page":"12104","volume":"33","author":"T Karras","year":"2020","unstructured":"Karras, T., Aittala, M., Hellsten, J., Laine, S., Lehtinen, J., Aila, T.: Training generative adversarial networks with limited data. Adv. Neural. Inf. Process. Syst. 33, 12104\u201312114 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR36","first-page":"852","volume":"34","author":"T Karras","year":"2021","unstructured":"Karras, T., et al.: Alias-free generative adversarial networks. Adv. Neural. Inf. Process. Syst. 34, 852\u2013863 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2019","DOI":"10.1109\/CVPR.2019.00453"},{"key":"15_CR38","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of StyleGAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8110\u20138119 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"15_CR39","unstructured":"Khosla, A., Jayadevaprakash, N., Yao, B., Fei-Fei, L.: Novel dataset for fine-grained image categorization. In: First Workshop on Fine-Grained Visual Categorization, IEEE Conference on Computer Vision and Pattern Recognition, Colorado Springs, CO, June 2011"},{"key":"15_CR40","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization (2017)"},{"key":"15_CR41","unstructured":"Kornblith, S., Norouzi, M., Lee, H., Hinton, G.: Similarity of neural network representations revisited. In: Chaudhuri, K., Salakhutdinov, R. (eds.) Proceedings of the 36th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a097, pp. 3519\u20133529. PMLR, 9\u201315 June 2019. https:\/\/proceedings.mlr.press\/v97\/kornblith19a.html"},{"key":"15_CR42","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3D object representations for fine-grained categorization. In: 4th International IEEE Workshop on 3D Representation and Recognition (3dRR-13), Sydney, Australia (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"15_CR43","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"15_CR44","doi-asserted-by":"crossref","unstructured":"Li, A.C., Prabhudesai, M., Duggal, S., Brown, E.L., Pathak, D.: Your diffusion model is secretly a zero-shot classifier. In: ICML 2023 Workshop on Structured Probabilistic Inference & Generative Modeling (2023). https:\/\/openreview.net\/forum?id=Ck3yXRdQXD","DOI":"10.1109\/ICCV51070.2023.00210"},{"key":"15_CR45","unstructured":"Li, C., et al.: Efficient self-supervised vision transformers for representation learning (2022)"},{"key":"15_CR46","doi-asserted-by":"crossref","unstructured":"Li, D., et al.: DreamTeacher: pretraining image backbones with deep generative models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 16698\u201316708, October 2023","DOI":"10.1109\/ICCV51070.2023.01531"},{"key":"15_CR47","doi-asserted-by":"crossref","unstructured":"Li, D., Ling, H., Kim, S.W., Kreis, K., Fidler, S., Torralba, A.: BigDatasetGAN: synthesizing ImageNet with pixel-wise annotations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21330\u201321340 (2022)","DOI":"10.1109\/CVPR52688.2022.02064"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Li, T., Chang, H., Mishra, S.K., Zhang, H., Katabi, D., Krishnan, D.: MAGE: masked generative encoder to unify representation learning and image synthesis (2022)","DOI":"10.1109\/CVPR52729.2023.00213"},{"key":"15_CR49","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, 6\u201312 September 2014, Proceedings, Part V 13. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"15_CR50","unstructured":"Maji, S., Kannala, J., Rahtu, E., Blaschko, M., Vedaldi, A.: Fine-grained visual classification of aircraft. Technical report (2013)"},{"key":"15_CR51","doi-asserted-by":"crossref","unstructured":"Misra, I., Maaten, L.V.D.: Self-supervised learning of pretext-invariant representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6707\u20136717 (2020)","DOI":"10.1109\/CVPR42600.2020.00674"},{"key":"15_CR52","unstructured":"Mnmoustafa, M.A.: Tiny imagenet (2017). https:\/\/kaggle.com\/competitions\/tiny-imagenet"},{"key":"15_CR53","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"15_CR54","unstructured":"Nie, W., et al.: Semi-supervised StyleGAN for disentanglement learning. In: Proceedings of the 37th International Conference on Machine Learning, pp. 7360\u20137369 (2020)"},{"key":"15_CR55","doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: Indian Conference on Computer Vision, Graphics and Image Processing, December 2008","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"15_CR56","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"15_CR57","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision (2023)"},{"key":"15_CR58","doi-asserted-by":"crossref","unstructured":"Pang, B., Zhang, Y., Li, Y., Cai, J., Lu, C.: Unsupervised visual representation learning by synchronous momentum grouping (2022)","DOI":"10.1007\/978-3-031-20056-4_16"},{"key":"15_CR59","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2536\u20132544 (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"15_CR60","doi-asserted-by":"crossref","unstructured":"Pidhorskyi, S., Adjeroh, D.A., Doretto, G.: Adversarial latent autoencoders. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14104\u201314113 (2020)","DOI":"10.1109\/CVPR42600.2020.01411"},{"key":"15_CR61","doi-asserted-by":"crossref","unstructured":"Pnvr, K., Singh, B., Ghosh, P., Siddiquie, B., Jacobs, D.: LD-ZNet: a latent diffusion approach for text-based image segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4157\u20134168 (2023)","DOI":"10.1109\/ICCV51070.2023.00384"},{"key":"15_CR62","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"15_CR63","unstructured":"Razavi, A., Van\u00a0den Oord, A., Vinyals, O.: Generating diverse high-fidelity images with VQ-VAE-2. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"15_CR64","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"15_CR65","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR66","doi-asserted-by":"crossref","unstructured":"Sariyildiz, M.B., Alahari, K., Larlus, D., Kalantidis, Y.: Fake it till you make it: learning transferable representations from synthetic ImageNet clones. In: CVPR 2023-IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1\u201311 (2023)","DOI":"10.1109\/CVPR52729.2023.00774"},{"key":"15_CR67","doi-asserted-by":"crossref","unstructured":"Sauer, A., Schwarz, K., Geiger, A.: StyleGAN-XL: scaling StyleGAN to large diverse datasets. vol. abs\/2201.00273 (2022). https:\/\/arxiv.org\/abs\/2201.00273","DOI":"10.1145\/3528233.3530738"},{"key":"15_CR68","doi-asserted-by":"publisher","unstructured":"Shrivastava, A., Gupta, A.: Contextual priming and feedback for faster R-CNN. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, 11\u201314 October 2016, Proceedings, Part I 14, pp. 330\u2013348. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_20","DOI":"10.1007\/978-3-319-46448-0_20"},{"key":"15_CR69","unstructured":"Tang, L., Jia, M., Wang, Q., Phoo, C.P., Hariharan, B.: Emergent correspondence from image diffusion. arXiv preprint arXiv:2306.03881 (2023)"},{"key":"15_CR70","unstructured":"Tomasev, N., et al.: Pushing the limits of self-supervised ResNets: can we outperform supervised learning without labels on ImageNet? (2022)"},{"key":"15_CR71","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1007\/978-3-030-58607-2_16","volume-title":"Computer Vision \u2013 ECCV 2020","author":"W Van Gansbeke","year":"2020","unstructured":"Van Gansbeke, W., Vandenhende, S., Georgoulis, S., Proesmans, M., Van Gool, L.: SCAN: learning to classify images without labels. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12355, pp. 268\u2013285. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58607-2_16"},{"key":"15_CR72","doi-asserted-by":"crossref","unstructured":"Van\u00a0Horn, G., et al.: Building a bird recognition app and large scale dataset with citizen scientists: the fine print in fine-grained dataset collection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2015","DOI":"10.1109\/CVPR.2015.7298658"},{"key":"15_CR73","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The Caltech-UCSD Birds-200-2011 dataset. Technical report. CNS-TR-2011-001, California Institute of Technology (2011)"},{"key":"15_CR74","doi-asserted-by":"crossref","unstructured":"Walmer, M., Suri, S., Gupta, K., Shrivastava, A.: Teaching matters: investigating the role of supervision in vision transformers (2023)","DOI":"10.1109\/CVPR52729.2023.00723"},{"key":"15_CR75","doi-asserted-by":"crossref","unstructured":"Xiang, W., Yang, H., Huang, D., Wang, Y.: Denoising diffusion autoencoders are unified self-supervised learners. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 15802\u201315812, October 2023","DOI":"10.1109\/ICCV51070.2023.01448"},{"key":"15_CR76","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"432","DOI":"10.1007\/978-3-030-01228-1_26","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Xiao","year":"2018","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 432\u2013448. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_26"},{"key":"15_CR77","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., Mello, S.D.: Open-vocabulary panoptic segmentation with text-to-image diffusion models (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"15_CR78","doi-asserted-by":"crossref","unstructured":"Yin, C., et al.: Automatic generation of medical imaging diagnostic report with hierarchical recurrent neural network. In: 2019 IEEE International Conference on Data Mining (ICDM), pp. 728\u2013737. IEEE (2019)","DOI":"10.1109\/ICDM.2019.00083"},{"key":"15_CR79","unstructured":"Yu, J., et al.: Vector-quantized image modeling with improved VQGAN. In: International Conference on Learning Representations (2021)"},{"key":"15_CR80","unstructured":"Zbontar, J., Jing, L., Misra, I., LeCun, Y., Deny, S.: Barlow twins: self-supervised learning via redundancy reduction. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 12310\u201312320. PMLR, 18\u201324 July 2021. https:\/\/proceedings.mlr.press\/v139\/zbontar21a.html"},{"key":"15_CR81","unstructured":"Zhang, J., et al.: A tale of two features: stable diffusion complements DINO for zero-shot semantic correspondence (2023)"},{"key":"15_CR82","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"15_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: DatasetGAN: efficient labeled data factory with minimal human effort. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10145\u201310155 (2021)","DOI":"10.1109\/CVPR46437.2021.01001"},{"key":"15_CR84","doi-asserted-by":"crossref","unstructured":"Zhao, W., Rao, Y., Liu, Z., Liu, B., Zhou, J., Lu, J.: Unleashing text-to-image diffusion models for visual perception (2023)","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"15_CR85","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ADE20K dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"15_CR86","unstructured":"Zhou, J., et al.: iBOT: image BERT pre-training with online tokenizer (2022)"},{"key":"15_CR87","unstructured":"Zhou, P., Zhou, Y., Si, C., Yu, W., Ng, T.K., Yan, S.: Mugs: a multi-granular self-supervised learning framework (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73027-6_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T18:14:10Z","timestamp":1732558450000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73027-6_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,26]]},"ISBN":["9783031730269","9783031730276"],"references-count":87,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73027-6_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,26]]},"assertion":[{"value":"26 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}