{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T02:13:48Z","timestamp":1771899228751,"version":"3.50.1"},"reference-count":100,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T00:00:00Z","timestamp":1740355200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T00:00:00Z","timestamp":1740355200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s10994-025-06745-w","type":"journal-article","created":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T17:06:51Z","timestamp":1740416811000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Uncover the balanced geometry in long-tailed contrastive language-image pretraining"],"prefix":"10.1007","volume":"114","author":[{"given":"Zhihan","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Yushi","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Feng","family":"Hong","sequence":"additional","affiliation":[]},{"given":"Peisen","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Jiangchao","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Ya","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Yanfeng","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,24]]},"reference":[{"key":"6745_CR1","first-page":"9758","volume":"33","author":"H Alwassel","year":"2020","unstructured":"Alwassel, H., Mahajan, D., Korbar, B., Torresani, L., Ghanem, B., & Tran, D. (2020). Self-supervised learning by cross-modal audio-video clustering. Adv. Neural Inf. Proc. Syst., 33, 9758\u20139770.","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"6745_CR2","unstructured":"Asano, Y.M., Patrick, M., Rupprecht, C., Vedaldi, A.: Labelling unlabelled videos from scratch with multi-modal self-supervision, in neural information processing systems (NeurIPS) (2020)"},{"key":"6745_CR3","unstructured":"Asano, Y.M., Rupprecht, C., Vedaldi, A.: Self-labelling via simultaneous clustering and representation learning, in: international conference on learning representations (ICLR) (2020)"},{"key":"6745_CR4","unstructured":"Assran, M., Balestriero, R., Duval, Q., Bordes, F., Misra, I., Bojanowski, P., Vincent, P., Rabbat, M., Ballas, N.: The hidden uniform cluster prior in self-supervised learning. In: The Eleventh International Conference on Learning Representations (2023)"},{"key":"6745_CR5","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., Zhou, J.: Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"6745_CR6","doi-asserted-by":"crossref","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features, in proceedings of the European conference on computer vision (ECCV), pp. 132\u2013149 (2018)","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"6745_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations, in international conference on machine learning, pp. 1597\u20131607 (2020). PMLR"},{"key":"6745_CR8","first-page":"20","volume":"22","author":"D Chen","year":"2023","unstructured":"Chen, D., Wu, Z., Liu, F., Yang, Z., Zheng, S., Tan, Y., & Zhou, E. (2023). Protoclip: Prototypical contrastive language image pretraining. IEEE Trans. Neural Netw. Learn. Syst., 22, 20.","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"6745_CR9","doi-asserted-by":"crossref","unstructured":"Cherti, M., Beaumont, R., Wightman, R., Wortsman, M., Ilharco, G., Gordon, C., Schuhmann, C., Schmidt, L., Jitsev, J.: Reproducible scaling laws for contrastive language-image learning. in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2818\u20132829 (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"6745_CR10","doi-asserted-by":"crossref","unstructured":"Chua, T.-S., Tang, J., Hong, R., Li, H., Luo, Z., Zheng, Y.: Nus-wide: a real-world web image database from national university of singapore, in proceedings of the ACM international conference on image and video retrieval, pp. 1\u20139 (2009)","DOI":"10.1145\/1646396.1646452"},{"key":"6745_CR11","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A.: Describing textures in the wild, in proceedings of the IEEE conference on computer vision and pattern recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.461"},{"key":"6745_CR12","unstructured":"Coates, A., Ng, A., Lee, H.: An analysis of single-layer networks in unsupervised feature learning, in: proceedings of the fourteenth international conference on artificial intelligence and statistics, pp. 215\u2013223 (2011). JMLR Workshop and Conference Proceedings"},{"key":"6745_CR13","doi-asserted-by":"crossref","unstructured":"Cubuk, E.D., Zoph, B., Shlens, J., Le, Q.V.: Randaugment: Practical automated data augmentation with a reduced search space, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops, pp. 702\u2013703 (2020)","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"6745_CR14","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jia, M., Lin, T.-Y., Song, Y., Belongie, S.: Class-balanced loss based on effective number of samples, In proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9268\u20139277 (2019)","DOI":"10.1109\/CVPR.2019.00949"},{"key":"6745_CR15","first-page":"23","volume":"26","author":"M Cuturi","year":"2013","unstructured":"Cuturi, M. (2013). Sinkhorn distances: Lightspeed computation of optimal transport. Adv. Neural Inf. Proc. Syst., 26, 23.","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"6745_CR16","unstructured":"Dehdashtian, S., Wang, L., Boddeti, V.N.: Fairerclip: Debiasing clip\u2019s zero-shot predictions using functions in rkhss. arXiv preprint arXiv:2403.15593 (2024)"},{"key":"6745_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, pp. 248\u2013255 (2009). Ieee","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"6745_CR18","unstructured":"Dong, B., Zhou, P., Yan, S., Zuo, W.: Lpt: Long-tailed prompt tuning for image classification, in the eleventh international conference on learning representations (2023)"},{"key":"6745_CR19","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: transformers for image recognition at scale. ICLR (2021)"},{"key":"6745_CR20","unstructured":"Ermolov, A., Siarohin, A., Sangineto, E., Sebe, N.: Whitening for self-supervised representation learning, in international conference on machine learning, pp. 3015\u20133024 (2021). PMLR"},{"key":"6745_CR21","unstructured":"Fang, A., Ilharco, G., Wortsman, M., Wan, Y., Shankar, V., Dave, A., Schmidt, L.: Data determines distributional robustness in contrastive language image pre-training (clip), in international conference on machine learning, pp. 6216\u20136234 (2022). PMLR"},{"issue":"43","key":"6745_CR22","doi-asserted-by":"crossref","first-page":"2103091118","DOI":"10.1073\/pnas.2103091118","volume":"118","author":"C Fang","year":"2021","unstructured":"Fang, C., He, H., Long, Q., & Su, W. J. (2021). Exploring deep neural networks via layer-peeled model: Minority collapse in imbalanced training. Proc. Natl. Acad. Sci., 118(43), 2103091118.","journal-title":"Proc. Natl. Acad. Sci."},{"issue":"4","key":"6745_CR23","doi-asserted-by":"crossref","first-page":"594","DOI":"10.1109\/TPAMI.2006.79","volume":"28","author":"L Fei-Fei","year":"2006","unstructured":"Fei-Fei, L., Fergus, R., & Perona, P. (2006). One-shot learning of object categories. IEEE Trans. Patt. Anal. Mach. Intell., 28(4), 594\u2013611.","journal-title":"IEEE Trans. Patt. Anal. Mach. Intell."},{"issue":"2","key":"6745_CR24","doi-asserted-by":"crossref","first-page":"179","DOI":"10.1111\/j.1469-1809.1936.tb02137.x","volume":"7","author":"RA Fisher","year":"1936","unstructured":"Fisher, R. A. (1936). The use of multiple measurements in taxonomic problems. Ann. Eugen., 7(2), 179\u2013188.","journal-title":"Ann. Eugen."},{"key":"6745_CR25","unstructured":"Galanti, T., Gy\u00f6rgy, A., Hutter, M.: On the role of neural collapse in transfer learning. In: International Conference on Learning Representations (2021)"},{"key":"6745_CR26","doi-asserted-by":"crossref","unstructured":"Goodfellow, I.J., Erhan, D., Carrier, P.L., Courville, A., Mirza, M., Hamner, B., Cukierski, W., Tang, Y., Thaler, D., Lee, D.-H., et\u00a0al.: Challenges in representation learning: A report on three machine learning contests. In: Neural Information Processing: 20th International Conference, ICONIP 2013, Daegu, Korea, November 3-7, 2013. Proceedings, Part III 20, pp. 117\u2013124 (2013). Springer","DOI":"10.1007\/978-3-642-42051-1_16"},{"key":"6745_CR27","unstructured":"Graf, F., Hofer, C., Niethammer, M., Kwitt, R.: Dissecting supervised constrastive learning. In: international conference on machine learning, pp. 3821\u20133830 (2021). PMLR"},{"key":"6745_CR28","unstructured":"Han, X., Papyan, V., Donoho, D.L.: Neural collapse under mse loss: Proximity to and dynamics on the central path. In: international conference on learning representations (2021)"},{"key":"6745_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition, in proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"7","key":"6745_CR30","doi-asserted-by":"crossref","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., & Borth, D. (2019). Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE J. Select. Top. Appl. Earth Observ. Remote Sens., 12(7), 2217\u20132226.","journal-title":"IEEE J. Select. Top. Appl. Earth Observ. Remote Sens."},{"key":"6745_CR31","doi-asserted-by":"crossref","unstructured":"Houben, S., Stallkamp, J., Salmen, J., Schlipsing, M., Igel, C.: Detection of traffic signs in real-world images: The German Traffic Sign Detection Benchmark, in international joint conference on neural networks (2013)","DOI":"10.1109\/IJCNN.2013.6706807"},{"key":"6745_CR32","doi-asserted-by":"publisher","unstructured":"Ilharco, G., Wortsman, M., Wightman, R., Gordon, C., Carlini, N., Taori, R., Dave, A., Shankar, V., Namkoong, H., Miller, J., Hajishirzi, H., Farhadi, A., Schmidt, L.: OpenCLIP. If you use this software, please cite it as below. https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"6745_CR33","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision, international conference on machine learning, pp. 4904\u20134916 (2021). PMLR"},{"key":"6745_CR34","unstructured":"Jiang, Z., Chen, T., Mortazavi, B.J., Wang, Z.: Self-damaging contrastive learning, in international conference on machine learning, pp. 4927\u20134939 (2021). PMLR"},{"issue":"3","key":"6745_CR35","doi-asserted-by":"crossref","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2019","unstructured":"Johnson, J., Douze, M., & J\u00e9gou, H. (2019). Billion-scale similarity search with gpus. IEEE Trans. Big Data, 7(3), 535\u2013547.","journal-title":"IEEE Trans. Big Data"},{"key":"6745_CR36","unstructured":"Kang, B., Xie, S., Rohrbach, M., Yan, Z., Gordo, A., Feng, J., Kalantidis, Y.: Decoupling representation and classifier for long-tailed recognition, in: international conference on learning representations (2019)"},{"key":"6745_CR37","unstructured":"Krizhevsky, A., Hinton, G., et al.: Learning multiple layers of features from tiny images (2009)"},{"issue":"1\u20132","key":"6745_CR38","doi-asserted-by":"crossref","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"HW Kuhn","year":"1955","unstructured":"Kuhn, H. W. (1955). The hungarian method for the assignment problem. Naval Res. Log. Quart., 2(1\u20132), 83\u201397.","journal-title":"Naval Res. Log. Quart."},{"key":"6745_CR39","unstructured":"Kukleva, A., B\u00f6hle, M., Schiele, B., Kuehne, H., Rupprecht, C.: Temperature schedules for self-supervised contrastive methods on long-tail data, in the eleventh international conference on learning representations (2023)"},{"issue":"11","key":"6745_CR40","doi-asserted-by":"crossref","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., & Haffner, P. (1998). Gradient-based learning applied to document recognition. Proc. IEEE, 86(11), 2278\u20132324.","journal-title":"Proc. IEEE"},{"key":"6745_CR41","doi-asserted-by":"crossref","unstructured":"Li, T., Cao, P., Yuan, Y., Fan, L., Yang, Y., Feris, R.S., Indyk, P., Katabi, D.: Targeted supervised contrastive learning for long-tailed recognition, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6918\u20136928 (2022)","DOI":"10.1109\/CVPR52688.2022.00679"},{"key":"6745_CR42","doi-asserted-by":"crossref","unstructured":"Li, Y., Fan, H., Hu, R., Feichtenhofer, C., He, K.: Scaling language-image pre-training via masking, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 23390\u201323400 (2023)","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"6745_CR43","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. in international conference on machine learning, pp. 19730\u201319742 (2023). PMLR"},{"key":"6745_CR44","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation, in international conference on machine learning, pp. 12888\u201312900 (2022). PMLR"},{"key":"6745_CR45","unstructured":"Li, Y., Liang, F., Zhao, L., Cui, Y., Ouyang, W., Shao, J., Yu, F., Yan, J.: Supervision exists everywhere: a data efficient contrastive language-image pre-training paradigm, in international conference on learning representations (2022)"},{"key":"6745_CR46","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"6745_CR47","doi-asserted-by":"crossref","unstructured":"Liang, P.P., Zadeh, A., Morency, L.-P.: Foundations & trends in multimodal machine learning: Principles, challenges, and open questions. ACM Computing Surveys (2023)","DOI":"10.1145\/3610661.3617602"},{"key":"6745_CR48","first-page":"19645","volume":"35","author":"X Liang","year":"2022","unstructured":"Liang, X., Wu, Y., Han, J., Xu, H., Xu, C., & Liang, X. (2022). Effective adaptation in multi-task co-training for unified autonomous driving. Adv. Neural Inf. Process. Syst., 35, 19645\u201319658.","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"6745_CR49","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection, in proceedings of the IEEE international conference on computer vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"6745_CR50","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: common objects in context, in computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6-12, 2014, proceedings, part V 13, pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"6745_CR51","doi-asserted-by":"crossref","unstructured":"Lin, W., Zhao, Z., Zhang, X., Wu, C., Zhang, Y., Wang, Y., Xie, W.: Pmc-clip: Contrastive language-image pre-training using biomedical documents. In: international conference on medical image computing and computer-assisted intervention, pp. 525\u2013536 (2023). Springer","DOI":"10.1007\/978-3-031-43993-3_51"},{"key":"6745_CR52","unstructured":"Liu, H., HaoChen, J.Z., Gaidon, A., Ma, T.: Self-supervised learning is more robust to dataset imbalance, in international conference on learning representations (2021)"},{"key":"6745_CR53","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s, proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"6745_CR54","doi-asserted-by":"crossref","unstructured":"Liu, Z., Miao, Z., Zhan, X., Wang, J., Gong, B., Yu, S.X.: Large-scale long-tailed recognition in an open world, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2537\u20132546 (2019)","DOI":"10.1109\/CVPR.2019.00264"},{"key":"6745_CR55","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"6745_CR56","unstructured":"Liu, X., Zhang, J., Hu, T., Cao, H., Yao, Y., Pan, L.: Inducing neural collapse in deep long-tailed learning, in international conference on artificial intelligence and statistics, pp. 11534\u201311544 (2023). PMLR"},{"key":"6745_CR57","doi-asserted-by":"crossref","unstructured":"Long, A., Yin, W., Ajanthan, T., Nguyen, V., Purkait, P., Garg, R., Blair, A., Shen, C., Hengel, A.: Retrieval augmented classification for long-tail visual recognition, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6959\u20136969 (2022)","DOI":"10.1109\/CVPR52688.2022.00683"},{"key":"6745_CR58","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization, in international conference on learning representations (2019)"},{"key":"6745_CR59","unstructured":"Lu, Z.: A theory of multimodal learning. Advances in Neural Information Processing Systems 36 (2024)"},{"key":"6745_CR60","unstructured":"Ma, T., Geng, S., Wang, M., Shao, J., Lu, J., Li, H., Gao, P., Qiao, Y.: A simple long-tailed recognition baseline via vision-language model. arXiv preprint arXiv:2111.14745 (2021)"},{"key":"6745_CR61","unstructured":"Menon, A.K., Jayasumana, S., Rawat, A.S., Jain, H., Veit, A., Kumar, S.: Long-tail learning via logit adjustment, in international conference on learning representations (2021)"},{"key":"6745_CR62","unstructured":"Micikevicius, P., Narang, S., Alben, J., Diamos, G., Elsen, E., Garcia, D., Ginsburg, B., Houston, M., Kuchaiev, O., Venkatesh, G., et\u00a0al.: Mixed precision training, in international conference on learning representations (2018)"},{"key":"6745_CR63","unstructured":"Mixon, D.G., Parshall, H., Pi, J.: Neural collapse with unconstrained features. arXiv preprint arXiv:2011.11619 (2020)"},{"key":"6745_CR64","doi-asserted-by":"crossref","unstructured":"Mu, N., Kirillov, A., Wagner, D., Xie, S.: Slip: Self-supervision meets language-image pre-training, in European conference on computer vision, pp. 529\u2013544 (2022). Springer","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"6745_CR65","doi-asserted-by":"crossref","unstructured":"Ng, E.G., Pang, B., Sharma, P., Soricut, R.: Understanding guided image captioning performance across domains. arXiv preprint arXiv:2012.02339 (2020)","DOI":"10.18653\/v1\/2021.conll-1.14"},{"key":"6745_CR66","doi-asserted-by":"crossref","unstructured":"Nilsback, M.-E., Zisserman, A.: Automated flower classification over a large number of classes. In: 2008 Sixth Indian conference on computer vision, graphics & image processing, pp. 722\u2013729 (2008). IEEE","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"6745_CR67","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"6745_CR68","unstructured":"OpenAI, R.: Gpt-4 technical report. arxiv 2303.08774. View in Article 2(5) (2023)"},{"key":"6745_CR69","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H.V., Szafraniec, M., Khalidov, V., Fernandez, P., HAZIZA, D., Massa, F., El-Nouby, A., et al.: Dinov2: Learning robust visual features without supervision. Transactions on Machine Learning Research (2023)"},{"issue":"40","key":"6745_CR70","doi-asserted-by":"crossref","first-page":"24652","DOI":"10.1073\/pnas.2015509117","volume":"117","author":"V Papyan","year":"2020","unstructured":"Papyan, V., Han, X., & Donoho, D. L. (2020). Prevalence of neural collapse during the terminal phase of deep learning training. Proc. Natl. Acad. Sci., 117(40), 24652\u201324663.","journal-title":"Proc. Natl. Acad. Sci."},{"key":"6745_CR71","doi-asserted-by":"crossref","unstructured":"Parashar, S., Lin, Z., Liu, T., Dong, X., Li, Y., Ramanan, D., Caverlee, J., Kong, S.: The neglected tails of vision-language models. arXiv preprint arXiv:2401.12425 (2024)","DOI":"10.1109\/CVPR52733.2024.01234"},{"key":"6745_CR72","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: proceedings of the IEEE international conference on computer vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"6745_CR73","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"6745_CR74","first-page":"4175","volume":"33","author":"J Ren","year":"2020","unstructured":"Ren, J., Yu, C., Ma, X., Zhao, H., Yi, S., et al. (2020). Balanced meta-softmax for long-tailed visual recognition. Adv. Neural Inf. Proc. Syst., 33, 4175\u20134186.","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"6745_CR75","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"6745_CR77","unstructured":"Sablayrolles, A., Douze, M., Schmid, C., J\u00e9gou, H.: Spreading vectors for similarity search, in international conference on learning representations (2019)"},{"key":"6745_CR78","unstructured":"Schuhmann, C., Vencu, R., Beaumont, R., Kaczmarczyk, R., Mullis, C., Katta, A., Coombes, T., Jitsev, J., Komatsuzaki, A.: Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"6745_CR79","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., et al. (2022). Laion-5b: An open large-scale dataset for training next generation image-text models. Adv. Neural Inf. Process. Syst., 35, 25278\u201325294.","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"6745_CR80","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning, in proceedings of ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"6745_CR81","doi-asserted-by":"crossref","unstructured":"Shi, J.-X., Zhang, C., Wei, T., Li, Y.-F.: Efficient and long-tailed generalization for pre-trained vision-language model. In: proceedings of the 30th ACM SIGKDD conference on knowledge discovery and data mining, pp. 2663\u20132673 (2024)","DOI":"10.1145\/3637528.3671945"},{"key":"6745_CR82","unstructured":"Shridhar, M., Manuelli, L., Fox, D.: Cliport: What and where pathways for robotic manipulation. In: conference on robot learning, pp. 894\u2013906 (2022). PMLR"},{"key":"6745_CR83","first-page":"583","volume":"3","author":"A Strehl","year":"2002","unstructured":"Strehl, A., & Ghosh, J. (2002). Cluster ensembles\u2013a knowledge reuse framework for combining multiple partitions. J. Mach. Learn. Res., 3, 583\u2013617.","journal-title":"J. Mach. Learn. Res."},{"key":"6745_CR84","doi-asserted-by":"crossref","unstructured":"Tian, Y., Henaff, O.J., Oord, A.: Divide and contrast: Self-supervised learning from uncurated data, in proceedings of the IEEE\/CVF international conference on computer vision, pp. 10063\u201310074 (2021)","DOI":"10.1109\/ICCV48922.2021.00991"},{"key":"6745_CR85","doi-asserted-by":"crossref","unstructured":"Tian, C., Wang, W., Zhu, X., Dai, J., Qiao, Y.: Vl-ltr: Learning class-wise visual-linguistic representation for long-tailed visual recognition, in European conference on computer vision, pp. 73\u201391 (2022). Springer","DOI":"10.1007\/978-3-031-19806-9_5"},{"key":"6745_CR86","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Proc. Syst. 30: (2017)"},{"key":"6745_CR87","unstructured":"Wang, T., Isola, P.: Understanding contrastive representation learning through alignment and uniformity on the hypersphere, in international conference on machine learning, pp. 9929\u20139939 (2020). PMLR"},{"key":"6745_CR88","unstructured":"Wang, Y., Zhang, Q., Wang, Y., Yang, J., Lin, Z.: Chaos is a ladder: a new theoretical understanding of contrastive learning via augmentation overlap, in international conference on learning representations (2021)"},{"issue":"1","key":"6745_CR89","doi-asserted-by":"crossref","first-page":"224","DOI":"10.1007\/s11263-023-01868-w","volume":"132","author":"Y Wang","year":"2024","unstructured":"Wang, Y., Yu, Z., Wang, J., Heng, Q., Chen, H., Ye, W., Xie, R., Xie, X., & Zhang, S. (2024). Exploring vision-language models for imbalanced learning. Int. J. Comput. Vis., 132(1), 224\u2013237.","journal-title":"Int. J. Comput. Vis."},{"key":"6745_CR90","doi-asserted-by":"crossref","unstructured":"Wu, B., Cheng, R., Zhang, P., Gao, T., Gonzalez, J.E., Vajda, P.: Data efficient language-supervised zero-shot recognition with optimal transport distillation, in international conference on learning representations (2022)","DOI":"10.1109\/CVPRW53098.2021.00348"},{"key":"6745_CR91","doi-asserted-by":"crossref","unstructured":"Wu, R., Papyan, V.: Linguistic collapse: Neural collapse in (large) language models. arXiv preprint arXiv:2405.17767 (2024)","DOI":"10.52202\/079017-4366"},{"key":"6745_CR92","doi-asserted-by":"crossref","unstructured":"Wu, C., Zhang, X., Zhang, Y., Wang, Y., Xie, W.: Medklip: Medical knowledge enhanced language-image pre-training for x-ray diagnosis. In proceedings of the IEEE\/CVF international conference on computer vision, pp. 21372\u201321383 (2023)","DOI":"10.1109\/ICCV51070.2023.01954"},{"key":"6745_CR93","doi-asserted-by":"crossref","first-page":"60","DOI":"10.1016\/j.neucom.2023.01.023","volume":"527","author":"L Xie","year":"2023","unstructured":"Xie, L., Yang, Y., Cai, D., & He, X. (2023). Neural collapse inspired attraction-repulsion-balanced loss for imbalanced learning. Neurocomputing, 527, 60\u201370.","journal-title":"Neurocomputing"},{"key":"6745_CR94","first-page":"37991","volume":"35","author":"Y Yang","year":"2022","unstructured":"Yang, Y., Chen, S., Li, X., Xie, L., Lin, Z., & Tao, D. (2022). Inducing neural collapse in imbalanced learning: Do we really need a learnable classifier at the end of deep neural network? Adv. Neural Inf. Process. Syst., 35, 37991\u201338002.","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"6745_CR95","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.H., Zhou, L., Dai, X., Yuan, L., Li, Y., et\u00a0al.: Regionclip: Region-based language-image pretraining, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"6745_CR96","doi-asserted-by":"crossref","unstructured":"Zhou, J., Dong, L., Gan, Z., Wang, L., Wei, F.: Non-contrastive learning meets language-image pre-training, in proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 11028\u201311038 (2023)","DOI":"10.1109\/CVPR52729.2023.01061"},{"key":"6745_CR97","unstructured":"Zhou, Z., Yao, J., Wang, Y.-F., Han, B., Zhang, Y.: Contrastive learning with boosted memorization, in international conference on machine learning, pp. 27367\u201327377 (2022). PMLR"},{"key":"6745_CR98","first-page":"20394","volume":"36","author":"Z Zhou","year":"2023","unstructured":"Zhou, Z., Yao, J., Hong, F., Zhang, Y., Han, B., & Wang, Y. (2023). Combating representation learning disparity with geometric harmonization. Adv. Neural Inf. Process. Syst., 36, 20394.","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"6745_CR99","doi-asserted-by":"crossref","unstructured":"Zhu, D., Li, Z., Zhang, M., Yuan, J., Liu, J., Kuang, K., Wu, C.: Neural collapse anchored prompt tuning for generalizable vision-language models, in proceedings of the 30th ACM SIGKDD conference on knowledge discovery and data mining, pp. 4631\u20134640 (2024)","DOI":"10.1145\/3637528.3671690"},{"key":"6745_CR100","first-page":"29820","volume":"34","author":"Z Zhu","year":"2021","unstructured":"Zhu, Z., Ding, T., Zhou, J., Li, X., You, C., Sulam, J., & Qu, Q. (2021). A geometric analysis of neural collapse with unconstrained features. Adv. Neural Inf. Proc. Syst., 34, 29820\u201329834.","journal-title":"Adv. Neural Inf. Proc. Syst."},{"key":"6745_CR101","doi-asserted-by":"crossref","unstructured":"Zhuang, Y., Wang, Y., Wu, F., Zhang, Y., Lu, W.: Supervised coupled dictionary learning with group structures for multi-modal retrieval, in proceedings of the AAAI conference on artificial intelligence, vol. 27, pp. 1070\u20131076 (2013)","DOI":"10.1609\/aaai.v27i1.8603"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-025-06745-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-025-06745-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-025-06745-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T01:02:41Z","timestamp":1771894961000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-025-06745-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,24]]},"references-count":100,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["6745"],"URL":"https:\/\/doi.org\/10.1007\/s10994-025-06745-w","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,24]]},"assertion":[{"value":"19 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 February 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no financial or non-financial interests to disclose that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"All datasets used in this work are available online and clearly cited. The data split for long-tailed sampling subsets will be available along with the code.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Data availability"}},{"value":"Not applicable.","order":6,"name":"Ethics","group":{"name":"EthicsHeading","label":"Materials availability"}},{"value":"The code of this work will be available after publication.","order":7,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}],"article-number":"106"}}