{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T20:03:22Z","timestamp":1775160202317,"version":"3.50.1"},"reference-count":203,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2022,5,25]],"date-time":"2022-05-25T00:00:00Z","timestamp":1653436800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,5,25]],"date-time":"2022-05-25T00:00:00Z","timestamp":1653436800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2022,7]]},"DOI":"10.1007\/s11263-022-01622-8","type":"journal-article","created":{"date-parts":[[2022,5,25]],"date-time":"2022-05-25T06:02:40Z","timestamp":1653458560000},"page":"1837-1872","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":124,"title":["A Survey on Long-Tailed Visual Recognition"],"prefix":"10.1007","volume":"130","author":[{"given":"Lu","family":"Yang","sequence":"first","affiliation":[]},{"given":"He","family":"Jiang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1936-224X","authenticated-orcid":false,"given":"Qing","family":"Song","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Guo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,5,25]]},"reference":[{"key":"1622_CR1","unstructured":"Abu-El-Haija, S., Kothari, N., Lee, J., Natsev, P., Toderici, G., Varadarajan, B., & Vijayanarasimhan, S. (2016). Youtube-8m: A large-scale video classification benchmark. 
arXiv:1609.08675"},{"key":"1622_CR2","doi-asserted-by":"crossref","unstructured":"An, X., Zhu, X., Xiao, Y., Wu, L., Zhang, M., Gao, Y., Qin, B., Zhang, D., & Fu, Y. (2020). Partial fc: Training 10 million identities on a single machine. arXiv:2010.05222","DOI":"10.1109\/ICCVW54120.2021.00166"},{"key":"1622_CR3","unstructured":"Anderson, C. (2006). The long tail: Why the future of business is selling less of more. Hachette Books."},{"key":"1622_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., & Gould, S. (2016). Spice: Semantic propositional image caption evaluation. In Proceedings of the European conference on computer vision (pp. 382\u2013398).","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1622_CR5","unstructured":"Andrej, K., George, T., Sanketh, S., Thomas, L., Rahul, S., & Li, F.F. (2014). Large-scale video classification with convolutional neural networks. In Proceedings of the IEEE international conference on computer vision (pp. 1725\u20131732)."},{"key":"1622_CR6","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., & Parikh, D. (2015). Vqa: Visual question answering. In Proceedings of the IEEE international conference on computer vision (pp. 2425\u20132433).","DOI":"10.1109\/ICCV.2015.279"},{"key":"1622_CR7","unstructured":"Brock, A., Jeff, D., & Karen, S. (2018). Large scale Gan training for high fidelity natural image synthesis. In International conference on learning representations."},{"key":"1622_CR8","unstructured":"Brown, T.\u00a0B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.\u00a0D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., Agarwal, S., Herbert-Voss, A., Krueger, G., Henighan, T., Child, R., Ramesh, A., Ziegler, D., Wu, J., Winter, C., Hesse, C., Chen, M., Sigler, E., Litwin, M., Gray, S., Chess, B., Clark, J., Berner, C., McCandlish, S., Radford, A., Sutskever, I., & Amodei, D. (2020). 
Language models are few-shot learners. In Advances in neural information processing systems (pp. 1877\u20131901)."},{"key":"1622_CR9","doi-asserted-by":"publisher","first-page":"249","DOI":"10.1016\/j.neunet.2018.07.011","volume":"106","author":"M Buda","year":"2018","unstructured":"Buda, M., Maki, A., & Mazurowski, M. A. (2018). A systematic study of the class imbalance problem in convolutional neural networks. Neural Networks, 106, 249\u2013259.","journal-title":"Neural Networks"},{"key":"1622_CR10","unstructured":"Byrd, J., & Lipton, Z. (2019). What is the effect of importance weighting in deep learning? In International conference on machine learning (pp. 872\u2013881). PMLR."},{"key":"1622_CR11","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., & Ferrari, V. (2018). Coco-stuff: Thing and stuff classes in context. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1209\u20131218).","DOI":"10.1109\/CVPR.2018.00132"},{"key":"1622_CR12","unstructured":"Cao, K., Wei, C., Gaidon, A., Arechiga, N., & Ma, T. (2019). Learning imbalanced datasets with label-distribution-aware margin loss. In Advances in neural information processing systems (pp. 1567\u20131578)"},{"key":"1622_CR13","unstructured":"Castrup, H. (2001). Distributions for uncertainty analysis. In Proceedings of international dimensional workshop (pp. 1\u201312)."},{"key":"1622_CR14","unstructured":"Chang, N., Koushik, J., Tarr, M.\u00a0J., Hebert, M., & Wang, Y.\u00a0X. (2020). Alpha net: Adaptation with composition in classifier space. arXiv:2008.07073"},{"key":"1622_CR15","doi-asserted-by":"publisher","first-page":"321","DOI":"10.1613\/jair.953","volume":"16","author":"NV Chawla","year":"2002","unstructured":"Chawla, N. V., Bowyer, K. W., Hall, L. O., & Kegelmeyer, W. P. (2002). Smote: Synthetic minority over-sampling technique. 
Journal of Artificial Intelligence Research, 16, 321\u2013357.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"1622_CR16","unstructured":"Chen, X., Fan, H., Girshick, R., & He, K. (2020). Improved baselines with momentum contrastive learning. arXiv:2003.04297"},{"key":"1622_CR17","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020). A simple framework for contrastive learning of visual representations. In International conference on machine learning (pp. 1597\u20131607). PMLR."},{"key":"1622_CR18","unstructured":"Cheng, B., Schwing, A.G., & Kirillov, A. (2021). Per-pixel classification is not all you need for semantic segmentation. arXiv:2107.06278"},{"key":"1622_CR19","doi-asserted-by":"crossref","unstructured":"Chou, H.\u00a0P., Chang, S.\u00a0C., Pan, J.\u00a0Y., Wei, W., & Juan, D.\u00a0C. (2020). Remix: Rebalanced mixup. In Proceedings of the European conference on computer vision (pp. 95\u2013110)","DOI":"10.1007\/978-3-030-65414-6_9"},{"key":"1622_CR20","doi-asserted-by":"crossref","unstructured":"Chu, P., Bian, X., Liu, S., & Ling, H. (2020). Feature space augmentation for long-tailed data. In Proceedings of the European conference on computer vision (pp. 694\u2013710).","DOI":"10.1007\/978-3-030-58526-6_41"},{"key":"1622_CR21","unstructured":"Contributors, M. (2020). Mmsegmentation: Openmmlab semantic segmentation toolbox and benchmark. https:\/\/github.com\/open-mmlab\/mmsegmentation"},{"key":"1622_CR22","doi-asserted-by":"crossref","unstructured":"Cubuk, E.\u00a0D., Zoph, B., Shlens, J., & Le, Q.\u00a0V. (2020). Randaugment: Practical automated data augmentation with a reduced search space. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops (pp. 702\u2013703).","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"1622_CR23","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jia, M., Lin, T.Y., Song, Y., & Belongie, S. (2019). 
Class-balanced loss based on effective number of samples. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9268\u20139277).","DOI":"10.1109\/CVPR.2019.00949"},{"key":"1622_CR24","doi-asserted-by":"crossref","unstructured":"Cui, J., Liu, S., Tian, Z., & Jia, J. (2021). Reslt: Residual learning for long-tailed recognition. arXiv:2101.10633","DOI":"10.1109\/TPAMI.2022.3174892"},{"key":"1622_CR25","doi-asserted-by":"crossref","unstructured":"Cui, J., Zhong, Z., Liu, S., Yu, B., & Jia, J. (2021). Parametric contrastive learning. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 715\u2013724).","DOI":"10.1109\/ICCV48922.2021.00075"},{"key":"1622_CR26","unstructured":"Dave, A., Doll\u00e1r, P., Ramanan, D., Kirillov, A., & Girshick, R. (2021). Evaluating large-vocabulary object detectors: The devil is in the details. arXiv:2102.01066"},{"key":"1622_CR27","doi-asserted-by":"publisher","first-page":"482","DOI":"10.1093\/biomet\/41.3-4.482","volume":"41","author":"A David","year":"1954","unstructured":"David, A., Hartley, O., & Pearson, S. (1954). The distribution of the ratio, in a single normal sample, of range to standard deviation. Biometrika, 41, 482\u2013493.","journal-title":"Biometrika"},{"key":"1622_CR28","doi-asserted-by":"crossref","unstructured":"Davidson, L. (1999). Uncertainty in economics. In Uncertainty, international money, employment and theory (pp. 30\u201337).","DOI":"10.1007\/978-1-349-14991-9_2"},{"key":"1622_CR29","doi-asserted-by":"publisher","first-page":"55","DOI":"10.52041\/serj.v4i1.525","volume":"4","author":"R Delmas","year":"2005","unstructured":"Delmas, R., & Yan, L. (2005). Exploring students\u2019 conceptions of the standard deviation. 
Statistics Education Research Journal, 4, 55\u201382.","journal-title":"Statistics Education Research Journal"},{"key":"1622_CR30","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 248\u2013255)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1622_CR31","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., & Zafeiriou, S. (2019). Arcface: Additive angular margin loss for deep face recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4690\u20134699).","DOI":"10.1109\/CVPR.2019.00482"},{"key":"1622_CR32","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.patrec.2016.10.006","volume":"93","author":"D Devi","year":"2017","unstructured":"Devi, D., & Purkayastha, B. (2017). Redundancy-driven modified Tomek-link based undersampling: A solution to class imbalance. Pattern Recognition Letters, 93, 3\u201312.","journal-title":"Pattern Recognition Letters"},{"key":"1622_CR33","unstructured":"Devlin, J., Chang, M.\u00a0W., Lee, K., & Toutanova, K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In Annual conference of the North American chapter of the association for computational linguistics: Human Language Technologies (pp. 4171\u20134186)"},{"key":"1622_CR34","doi-asserted-by":"publisher","first-page":"e0240783","DOI":"10.1371\/journal.pone.0240783","volume":"15","author":"G Dina","year":"2020","unstructured":"Dina, G., Michael, J., David, H., Julio, D., & Robert, S. (2020). Decreasing median age of covid-19 cases in the united states\u2013changing epidemiology or changing surveillance? PLOS ONE, 15, e0240783.","journal-title":"PLOS ONE"},{"key":"1622_CR35","doi-asserted-by":"crossref","unstructured":"Dong, Q., Gong, S., & Zhu, X. (2017). 
Class rectification hard mining for imbalanced deep learning. In Proceedings of the IEEE international conference on computer vision (pp. 1851\u20131860).","DOI":"10.1109\/ICCV.2017.205"},{"key":"1622_CR36","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2018.2832629","volume":"41","author":"Q Dong","year":"2018","unstructured":"Dong, Q., Gong, S., & Zhu, X. (2018). Imbalanced deep learning by minority class incremental rectification. IEEE Transactions on Pattern Analysis and Machine Intelligence, 41, 1367\u20131381.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1622_CR37","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth $$16{\\times }16$$ words: Transformers for image recognition at scale. In International conference on learning representations."},{"key":"1622_CR38","unstructured":"Dvir, S., & Gal, C. (2021). Distributional robustness loss for long-tail learning. arXiv:2104.03066"},{"key":"1622_CR39","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K., Winn, J., & Zisserman, A. (2010). The pascal visual object classes (voc) challenge. International Journal of Computer Vision, 88, 303\u2013338.","journal-title":"International Journal of Computer Vision"},{"key":"1622_CR40","doi-asserted-by":"crossref","unstructured":"Fan, Q., Zhuo, W., Tang, C.\u00a0K., & Tai, Y.\u00a0W. (2020). Few-shot object detection with attention-RPN and multi-relation detector. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 
4013\u20134022).","DOI":"10.1109\/CVPR42600.2020.00407"},{"key":"1622_CR41","doi-asserted-by":"publisher","first-page":"1656","DOI":"10.1378\/chest.117.6.1656","volume":"117","author":"A Fogarty","year":"2000","unstructured":"Fogarty, A., Richard, H., & John, B. (2000). International comparison of median age at death from cystic fibrosis. Chest, 117, 1656\u20131660.","journal-title":"Chest"},{"key":"1622_CR42","doi-asserted-by":"publisher","first-page":"1423","DOI":"10.1080\/01621459.1996.10476710","volume":"91","author":"M Ghosh","year":"1996","unstructured":"Ghosh, M., Nangia, N., & Kim, D. H. (1996). Estimation of median income of four-person families: A Bayesian time series approach. Journal of the American Statistical Association, 91, 1423\u20131431.","journal-title":"Journal of the American Statistical Association"},{"key":"1622_CR43","doi-asserted-by":"crossref","unstructured":"Gidaris, S., & Komodakis, N. (2018). Dynamic few-shot visual learning without forgetting. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4367\u20134375).","DOI":"10.1109\/CVPR.2018.00459"},{"key":"1622_CR44","unstructured":"Gini, C. (1912). Variabilit\u00e0 e mutabilit\u00e0. Memorie di metodologica statistica."},{"key":"1622_CR45","doi-asserted-by":"crossref","unstructured":"Girshick, R. (2015). Fast R-CNN. In Proceedings of the IEEE international conference on computer vision (pp. 1440\u20131448).","DOI":"10.1109\/ICCV.2015.169"},{"key":"1622_CR46","unstructured":"Goodfellow, I., Mehdi\u00a0Mirza, J.\u00a0P.\u00a0A., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. In Advances in neural information processing systems."},{"key":"1622_CR47","doi-asserted-by":"crossref","unstructured":"Goyal, R., Kahou, S.E., Michalski, V., Materzynska, J., Westphal, S., Heuna, K., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., Hoppe, F., Thurau, C., Bax, I., & Memisevic, R. (2017). 
The \u201csomething something\u201d video database for learning and evaluating visual common sense. In Proceedings of the IEEE international conference on computer vision (pp. 5842\u20135850).","DOI":"10.1109\/ICCV.2017.622"},{"key":"1622_CR48","unstructured":"Gu, X., Lin, T.\u00a0Y., Kuo, W., & Cui, Y. (2021). Zero-shot detection via vision and language knowledge distillation. arXiv:2104.13921"},{"key":"1622_CR49","unstructured":"Gui, S., Wang, H., Yang, H., Wang, C.\u00a0Y.\u00a0Z., & Liu., J. (2019). Model compression with adversarial robustness: A unified optimization framework. In Advances in neural information processing systems (pp. 1283\u20131294)."},{"key":"1622_CR50","doi-asserted-by":"crossref","unstructured":"Guo, Y., Zhang, L., Hu, Y., He, X., & Gao, J. (2016). Ms-celeb-1m: A dataset and benchmark for large-scale face recognition. In Proceedings of the European conference on computer vision (pp. 87\u2013102).","DOI":"10.1007\/978-3-319-46487-9_6"},{"key":"1622_CR51","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., & Girshick, R. (2019). Lvis: A dataset for large vocabulary instance segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 5356\u20135364).","DOI":"10.1109\/CVPR.2019.00550"},{"key":"1622_CR52","unstructured":"Hadsell, R., Chopra, S., & LeCun, Y. (2006) Dimensionality reduction by learning an invariant mapping. In Proceedings of the IEEE conference on computer vision and pattern recognition."},{"key":"1622_CR53","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1016\/j.eswa.2016.12.035","volume":"73","author":"G Haixiang","year":"2017","unstructured":"Haixiang, G., Yijing, L., Shang, J., Mingyun, G., Yuanyue, H., & Bing, G. (2017). Learning from class-imbalanced data: Review of methods and applications. 
Expert Systems with Applications, 73, 220\u2013239.","journal-title":"Expert Systems with Applications"},{"key":"1622_CR54","doi-asserted-by":"crossref","unstructured":"Han, H., Wang, W.\u00a0Y., & Mao, B.\u00a0H. (2005). Borderline-smote: A new over-sampling method in imbalanced data sets learning. In International Conference on Intelligent Computing (pp. 878\u2013887). Springer.","DOI":"10.1007\/11538059_91"},{"key":"1622_CR55","unstructured":"He, H., Bai, Y., Garcia, E.\u00a0A., & Li, S. (2008) Adasyn: Adaptive synthetic sampling approach for imbalanced learning. In 2008 IEEE international joint conference on neural networks (pp. 1322\u20131328)."},{"key":"1622_CR56","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R. (2020). Momentum contrast for unsupervised visual representation learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9729\u20139738).","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"1622_CR57","doi-asserted-by":"crossref","unstructured":"He, Y.\u00a0Y., Wu, J., & Wei, X.\u00a0S. (2021). Distilling virtual examples for long-tailed recognition. arXiv:2103.15042","DOI":"10.1109\/ICCV48922.2021.00030"},{"key":"1622_CR58","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"1622_CR59","doi-asserted-by":"publisher","first-page":"1263","DOI":"10.1109\/TKDE.2008.239","volume":"21","author":"H He","year":"2009","unstructured":"He, H., & Garcia, E. A. (2009). Learning from imbalanced data. IEEE Transactions on Knowledge and Data Engineering, 21, 1263\u20131284.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"1622_CR60","unstructured":"Hinton, G., Vinyals, O., & Dean, J. (2015). 
Distilling the knowledge in a neural network. arXiv:1503.02531"},{"key":"1622_CR61","doi-asserted-by":"crossref","unstructured":"Hong, Y., Han, S., Choi, K., Seo, S., Kim, B., & Chang, B. (2021). Disentangling label distribution for long-tailed visual recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 6626\u20136636).","DOI":"10.1109\/CVPR46437.2021.00656"},{"key":"1622_CR62","doi-asserted-by":"crossref","unstructured":"Hsieh, T.\u00a0I., Robb, E., Chen, H.\u00a0T., & Huang, J.\u00a0B. (2021). Droploss for long-tail instance segmentation. In Proceedings of the AAAI conference on artificial intelligence (pp. 1549\u20131557).","DOI":"10.1609\/aaai.v35i2.16246"},{"key":"1622_CR63","doi-asserted-by":"crossref","unstructured":"Hu, X., Jiang, Y., Tang, K., Chen, J., Miao, C., & Zhang, H. (2020). Learning to segment the tail. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 14045\u201314054).","DOI":"10.1109\/CVPR42600.2020.01406"},{"key":"1622_CR64","doi-asserted-by":"crossref","unstructured":"Huang, C., Li, Y., Loy, C.\u00a0C., & Tang, X. (2016). Learning deep representation for imbalanced classification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5375\u20135384).","DOI":"10.1109\/CVPR.2016.580"},{"key":"1622_CR65","doi-asserted-by":"publisher","first-page":"2781","DOI":"10.1109\/TPAMI.2019.2914680","volume":"42","author":"C Huang","year":"2019","unstructured":"Huang, C., Li, Y., Loy, C. C., & Tang, X. (2019). Deep imbalanced learning for face recognition and attribute prediction. IEEE Transactions on Pattern Analysis and Machine Intelligence, 42, 2781\u20132794.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1622_CR66","unstructured":"Inaturalist (2018). Competition dataset. 
https:\/\/github.com\/visipedia\/inat_comp\/tree\/master\/2018"},{"key":"1622_CR67","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","volume":"3","author":"RA Jacobs","year":"1991","unstructured":"Jacobs, R. A., Jordan, M. I., Nowlan, S. J., & Hinton, G. E. (1991). Adaptive mixtures of local experts. Neural Computation, 3, 79\u201387.","journal-title":"Neural Computation"},{"key":"1622_CR68","doi-asserted-by":"crossref","unstructured":"Jamal, M.\u00a0A., Brown, M., Yang, M.\u00a0H., Wang, L., & Gong, B. (2020). Rethinking class-balanced methods for long-tailed visual recognition from a domain adaptation perspective. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 7610\u20137619).","DOI":"10.1109\/CVPR42600.2020.00763"},{"key":"1622_CR69","doi-asserted-by":"publisher","first-page":"29","DOI":"10.4103\/2153-3539.186902","volume":"7","author":"A Janowczyk","year":"2016","unstructured":"Janowczyk, A., & Madabhushi, A. (2016). Deep learning for digital pathology image analysis: A comprehensive tutorial with selected use cases. Journal of Pathology Informatics, 7, 29.","journal-title":"Journal of Pathology Informatics"},{"key":"1622_CR70","doi-asserted-by":"publisher","first-page":"429","DOI":"10.3233\/IDA-2002-6504","volume":"6","author":"N Japkowicz","year":"2002","unstructured":"Japkowicz, N., & Stephen, S. (2002). The class imbalance problem: A systematic study. Intelligent Data Analysis, 6, 429\u2013449.","journal-title":"Intelligent Data Analysis"},{"key":"1622_CR71","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E., & Chen, X. (2020). In defense of grid features for visual question answering. 
In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"1622_CR72","doi-asserted-by":"publisher","first-page":"181","DOI":"10.1162\/neco.1994.6.2.181","volume":"6","author":"MI Jordan","year":"1994","unstructured":"Jordan, M. I., & Jacobs, R. A. (1994). Hierarchical mixtures of experts and the EM algorithm. Neural Comput, 6, 181\u2013214.","journal-title":"Neural Comput"},{"key":"1622_CR73","doi-asserted-by":"publisher","first-page":"237","DOI":"10.1613\/jair.301","volume":"4","author":"LP Kaelbling","year":"1996","unstructured":"Kaelbling, L. P., Littman, M. L., & Moore, A. W. (1996). Reinforcement learning: A survey. Journal of Artificial Intelligence Research, 4, 237\u2013285.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"1622_CR74","doi-asserted-by":"publisher","first-page":"263","DOI":"10.1287\/opre.1.5.263","volume":"1","author":"H Kahn","year":"1953","unstructured":"Kahn, H., & Marshall, A. W. (1953). Methods of reducing sample size in Monte Carlo computations. Journal of the Operations Research Society of America, 1, 263\u2013278.","journal-title":"Journal of the Operations Research Society of America"},{"key":"1622_CR75","doi-asserted-by":"publisher","first-page":"719","DOI":"10.2307\/1911684","volume":"45","author":"NC Kakwani","year":"1977","unstructured":"Kakwani, N. C. (1977). Applications of Lorenz curves in economic analysis. Econometrica, 45, 719\u2013727.","journal-title":"Econometrica"},{"key":"1622_CR76","unstructured":"Kang, B., Xie, S., Rohrbach, M., Yan, Z., Gordo, A., Feng, J., & Kalantidis, Y. (2020). Decoupling representation and classifier for long-tailed recognition. In International conference on learning representations."},{"key":"1622_CR77","doi-asserted-by":"crossref","unstructured":"Karras, T., Samuli, L., & Timo, A. (2019). A style-based generator architecture for generative adversarial networks. 
In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4401\u20134410).","DOI":"10.1109\/CVPR.2019.00453"},{"key":"1622_CR78","doi-asserted-by":"crossref","unstructured":"Kim, J., Jeong, J., & Shin, J. (2020). M2m: Imbalanced classification via major-to-minor translation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 13896\u201313905).","DOI":"10.1109\/CVPR42600.2020.01391"},{"key":"1622_CR79","doi-asserted-by":"crossref","unstructured":"Kim, D.\u00a0J., Sun, X., Choi, J., Lin, S., & Kweon, I.\u00a0S. (2020). Detecting human-object interactions with action co-occurrence priors. In Proceedings of the European conference on computer vision (pp. 718\u2013736)","DOI":"10.1007\/978-3-030-58589-1_43"},{"key":"1622_CR80","unstructured":"Kingma, D.\u00a0P., & Welling, M. (2013). Auto-encoding variational bayes. arXiv:1312.6114"},{"key":"1622_CR81","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., & Dollar, P. (2019). Panoptic feature pyramid networks. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 6399\u20136408).","DOI":"10.1109\/CVPR.2019.00656"},{"key":"1622_CR82","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., et al. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 123, 32\u201373.","journal-title":"International Journal of Computer Vision"},{"key":"1622_CR83","unstructured":"Krizhevsky, A., & Hinton, G. (2009). Learning multiple layers of features from tiny images. 
Tech Report."},{"key":"1622_CR84","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., Rom, H., Alldrin, N., Uijlings, J., Krasin, I., Pont-Tuset, J., et al. (2020). The open images dataset v4. International Journal of Computer Vision, 128, 1956\u20131981.","journal-title":"International Journal of Computer Vision"},{"key":"1622_CR85","doi-asserted-by":"crossref","unstructured":"Lample, G., Ott, M., Conneau, A., Denoyer, L., & Ranzato, M. (2018). Phrase-based and neural unsupervised machine translation. arXiv:1804.07755","DOI":"10.18653\/v1\/D18-1549"},{"key":"1622_CR86","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). Albert: A lite bert for self-supervised learning of language representations. arXiv:1909.11942"},{"key":"1622_CR87","doi-asserted-by":"crossref","unstructured":"Levi, G., & Hassner, T. (2015). Age and gender classification using convolutional neural networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 34\u201342).","DOI":"10.1109\/CVPRW.2015.7301352"},{"key":"1622_CR88","unstructured":"Li, T., Cao, P., Yuan, Y., Fan, L., Yang, Y., Feris, R., Indyk, P., & Katabi, D. (2021). Targeted supervised contrastive learning for long-tailed recognition. arXiv:2111.13998"},{"key":"1622_CR89","doi-asserted-by":"crossref","unstructured":"Li, Z., Dekel, T., Cole, F., Tucker, R., Snavely, N., Liu, C., & Freeman, W.\u00a0T. (2019). Learning the depths of moving people by watching frozen people. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4521\u20134530).","DOI":"10.1109\/CVPR.2019.00465"},{"key":"1622_CR90","doi-asserted-by":"crossref","unstructured":"Li, S., Gong, K., Liu, C.\u00a0H., Wang, Y., Qiao, F., & Cheng, X. (2021). Metasaug: Meta semantic augmentation for long-tailed visual recognition. 
In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 5212\u20135221).","DOI":"10.1109\/CVPR46437.2021.00517"},{"key":"1622_CR91","doi-asserted-by":"crossref","unstructured":"Li, B., Liu, Y., & Wang, X. (2019). Gradient harmonized single-stage detector. In Proceedings of the AAAI conference on artificial intelligence (pp. 8577\u20138584).","DOI":"10.1609\/aaai.v33i01.33018577"},{"key":"1622_CR92","doi-asserted-by":"crossref","unstructured":"Li, J., Tang, S., Li, J., Xiao, J., Wu, F., Pu, S., & Zhuang, Y. (2020). Topic adaptation and prototype encoding for few-shot visual storytelling. In Proceedings of the ACM international conference on multimedia (pp. 4208\u20134216).","DOI":"10.1145\/3394171.3413886"},{"key":"1622_CR93","doi-asserted-by":"crossref","unstructured":"Li, T., Wang, L., & Wu, G. (2021). Self supervision to distillation for long-tailed visual recognition. arXiv:2109.04075","DOI":"10.1109\/ICCV48922.2021.00067"},{"key":"1622_CR94","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, T., Kang, B., Tang, S., Wang, C., Li, J., & Feng, J. (2020). Overcoming classifier imbalance for long-tail object detection with balanced group softmax. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 10991\u201311000).","DOI":"10.1109\/CVPR42600.2020.01100"},{"key":"1622_CR95","doi-asserted-by":"crossref","unstructured":"Li, X., Wei, T., Chen, Y.\u00a0P., Tai, Y.\u00a0W., & Tang, C.\u00a0K. (2020). Fss-1000: A 1000-class dataset for few-shot segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR42600.2020.00294"},{"key":"1622_CR96","doi-asserted-by":"crossref","unstructured":"Li, B., Yao, Y., Tan, J., Zhang, G., Yu, F., Lu, J., & Luo, Y. (2022). Equalized focal loss for dense long-tailed object detection. 
arXiv:2201.02593","DOI":"10.1109\/CVPR52688.2022.00686"},{"key":"1622_CR97","doi-asserted-by":"crossref","unstructured":"Lin, T.\u00a0Y., Goyal, P., Girshick, R., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In Proceedings of the IEEE international conference on computer vision (pp. 2980\u20132988).","DOI":"10.1109\/ICCV.2017.324"},{"key":"1622_CR98","doi-asserted-by":"crossref","unstructured":"Lin, T.\u00a0Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.\u00a0L. (2014). Microsoft coco: Common objects in context. In Proceedings of the European conference on computer vision (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1622_CR99","doi-asserted-by":"crossref","unstructured":"Liu, T.\u00a0Y. (2011). Learning to rank for information retrieval.","DOI":"10.1007\/978-3-642-14267-3"},{"key":"1622_CR100","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C.\u00a0Y., & Berg, A.\u00a0C. (2016). Ssd: Single shot multibox detector. In Proceedings of the European conference on computer vision (pp. 21\u201337).","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"1622_CR101","doi-asserted-by":"crossref","unstructured":"Liu, B., Li, H., Kang, H., & Hua, G. (2021). Gistet: A geometric structure transfer network for long-tailed recognition. arXiv:2105.00131","DOI":"10.1109\/ICCV48922.2021.00810"},{"key":"1622_CR102","unstructured":"Liu, B., Li, H., Kang, H., Hua, G., & Vasconcelos, N. (2021). Breadcrumbs: Adversarial class-balanced sampling for long-tailed recognition. arXiv:2105.00127"},{"key":"1622_CR103","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 
10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1622_CR104","doi-asserted-by":"crossref","unstructured":"Liu, Z., Miao, Z., Zhan, X., Wang, J., Gong, B., & Yu, S.\u00a0X. (2019). Large-scale long-tailed recognition in an open world. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2537\u20132546).","DOI":"10.1109\/CVPR.2019.00264"},{"key":"1622_CR105","doi-asserted-by":"crossref","unstructured":"Liu, J., Sun, Y., Han, C., Dou, Z., & Li, W. (2020). Deep representation learning on long-tailed data: A learnable embedding augmentation perspective. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2970\u20132979).","DOI":"10.1109\/CVPR42600.2020.00304"},{"key":"1622_CR106","unstructured":"Liu, J., Zhang, J., Li, W., Zhang, C., & Sun, Y. (2020). Memory-based jitter: Improving visual recognition on long-tailed data with diversity in memory. arXiv:2008.09809"},{"key":"1622_CR107","first-page":"539","volume":"39","author":"XY Liu","year":"2008","unstructured":"Liu, X. Y., Wu, J., & Zhou, Z. H. (2008). Exploratory undersampling for class-imbalance learning. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 39, 539\u2013550.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)"},{"key":"1622_CR108","unstructured":"Lvis Challenge (2019). https:\/\/www.lvisdataset.org\/"},{"key":"1622_CR109","unstructured":"Madry, A., Makelov, A., Schmidt, L., Tsipras, D., & Vladu, A. (2018). Towards deep learning models resistant to adversarial attacks. In International conference on learning representations."},{"key":"1622_CR110","doi-asserted-by":"crossref","unstructured":"Mahajan, D., Girshick, R., Ramanathan, V., He, K., Paluri, M., Li, Y., Bharambe, A., & Van Der\u00a0Maaten, L. (2018). Exploring the limits of weakly supervised pretraining. In Proceedings of the European conference on computer vision (pp. 
181\u2013196).","DOI":"10.1007\/978-3-030-01216-8_12"},{"key":"1622_CR111","unstructured":"Mani, I., & Zhang, I. (2003). KNN approach to unbalanced data distributions: A case study involving information extraction. In Proceedings of workshop on learning from imbalanced datasets vol. 126. ICML United States."},{"key":"1622_CR112","doi-asserted-by":"publisher","first-page":"275","DOI":"10.1007\/s10462-012-9338-y","volume":"42","author":"S Masoudnia","year":"2014","unstructured":"Masoudnia, S., & Ebrahimpour, R. (2014). Mixture of experts: A literature survey. Artificial Intelligence Review, 42, 275\u2013293.","journal-title":"Artificial Intelligence Review"},{"key":"1622_CR113","unstructured":"Menon, A.\u00a0K., Jayasumana, S., Rawat, A.\u00a0S., Jain, H., Veit, A., & Kumar, S. (2021). Long-tail learning via logit adjustment. In International conference on learning representations."},{"key":"1622_CR114","doi-asserted-by":"crossref","unstructured":"Miao, J., Wei, Y., Wu, Y., Liang, C., Li, G., & Yang, Y. (2021). Vspw: A large-scale dataset for video scene parsing in the wild. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4133\u20134143).","DOI":"10.1109\/CVPR46437.2021.00412"},{"key":"1622_CR115","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G., & Dean, J. (2013). Distributed representations of words and phrases and their compositionality. arXiv:1310.4546"},{"key":"1622_CR116","unstructured":"Narayanan, A., Chen, Y.\u00a0T., & Malla, S. (2018). Semi-supervised learning: Fusion of self-supervised, supervised learning, and multimodal cues for tactical driver behavior detection. arXiv:1807.00864"},{"key":"1622_CR117","doi-asserted-by":"crossref","unstructured":"Oh\u00a0Song, H., Xiang, Y., Jegelka, S., & Savarese, S. (2016). Deep metric learning via lifted structured feature embedding. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 
4004\u20134012).","DOI":"10.1109\/CVPR.2016.434"},{"key":"1622_CR118","doi-asserted-by":"publisher","first-page":"3388","DOI":"10.1109\/TPAMI.2020.2981890","volume":"43","author":"K Oksuz","year":"2020","unstructured":"Oksuz, K., Cam, B. C., Kalkan, S., & Akbas, E. (2020). Imbalance problems in object detection: A review. IEEE Transactions on Pattern Analysis and Machine Intelligence, 43, 3388\u20133415.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1622_CR119","doi-asserted-by":"crossref","unstructured":"Ouyang, W., Wang, X., Zhang, C., & Yang, X. (2016). Factors in finetuning deep model for object detection with long-tail distribution. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 864\u2013873).","DOI":"10.1109\/CVPR.2016.100"},{"key":"1622_CR120","doi-asserted-by":"crossref","unstructured":"Peng, J., Bu, X., Sun, M., Zhang, Z., Tan, T., & Yan, J. (2020). Large-scale object detection in the wild from imbalanced multi-labels. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9709\u20139718).","DOI":"10.1109\/CVPR42600.2020.00973"},{"key":"1622_CR121","doi-asserted-by":"crossref","unstructured":"Peng, Z., Huang, W., Guo, Z., Zhang, X., Jiao, J., & Ye, Q. (2021). Long-tailed distribution adaptation. In Proceedings of the ACM international conference on multimedia (pp. 3275\u20133282).","DOI":"10.1145\/3474085.3475479"},{"key":"1622_CR122","unstructured":"Prabhu, V., Kannan, A., Ravuri, M., Chablani, M., Sontag, D., & Amatriain, X. (2018). Prototypical clustering networks for dermatological disease diagnosis. arXiv:1811.03066"},{"key":"1622_CR123","unstructured":"Radford, A., Kim, J.\u00a0W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. 
arXiv:2103.00020"},{"key":"1622_CR124","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., Wang, R., & Mahajan, D. (2020). Dlwl: Improving detection for lowshot classes with weakly labelled data. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9342\u20139352).","DOI":"10.1109\/CVPR42600.2020.00936"},{"key":"1622_CR125","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems vol.\u00a028 (pp. 91\u201399)."},{"key":"1622_CR126","unstructured":"Ren, J., Yu, C., Sheng, S., Ma, X., Zhao, H., Yi, S., & Li, H. (2020). Balanced meta-softmax for long-tailed visual recognition. In Advances in neural information processing systems."},{"key":"1622_CR127","unstructured":"Ren, M., Zeng, W., Yang, B., & Urtasun, R. (2018). Learning to reweight examples for robust deep learning. In International conference on machine learning (pp. 4334\u20134343). PMLR."},{"key":"1622_CR128","unstructured":"Riquelme, C., Puigcerver, J., Mustafa, B., Neumann, M., Jenatton, R., Pinto, A.\u00a0S., Keysers, D., & Houlsby, N. (2021). Scaling vision with sparse mixture of experts. arXiv:2106.05974"},{"key":"1622_CR129","doi-asserted-by":"crossref","unstructured":"Ristani, E., Solera, F., Zou, R.\u00a0S., Cucchiara, R., & Tomasi, C. (2016). Performance measures and a data set for multi-target, multi-camera tracking. In Proceedings of the European conference on computer vision (pp. 17\u201335).","DOI":"10.1007\/978-3-319-48881-3_2"},{"key":"1622_CR130","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., et al. (2015). Imagenet large scale visual recognition challenge. 
International Journal of Computer Vision, 115, 211\u2013252.","journal-title":"International Journal of Computer Vision"},{"key":"1622_CR131","doi-asserted-by":"crossref","unstructured":"Shaham, T.R., Dekel, T., & Michaeli, T. (2019). Singan: Learning a generative model from a single natural image. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 4570\u20134580).","DOI":"10.1109\/ICCV.2019.00467"},{"key":"1622_CR132","doi-asserted-by":"crossref","unstructured":"Shao, S., Li, Z., Zhang, T., Peng, C., Yu, G., Zhang, X., Li, J., & Sun, J. (2019). Objects365: A large-scale, high-quality dataset for object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 8430\u20138439).","DOI":"10.1109\/ICCV.2019.00852"},{"key":"1622_CR133","doi-asserted-by":"crossref","unstructured":"Shen, L., Lin, Z., & Huang, Q. (2016). Relay backpropagation for effective learning of deep convolutional neural networks. In Proceedings of the European conference on computer vision (pp. 467\u2013482).","DOI":"10.1007\/978-3-319-46478-7_29"},{"key":"1622_CR134","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Gupta, A., & Girshick, R. (2016). Training region-based object detectors with online hard example mining. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 761\u2013769).","DOI":"10.1109\/CVPR.2016.89"},{"key":"1622_CR135","doi-asserted-by":"crossref","unstructured":"Shu, X., Wang, X., Zang, X., Zhang, S., Chen, Y., Li, G., & Tian, Q. (2021). Large-scale spatio-temporal person re-identification: Algorithm and benchmark. arXiv:2105.15076","DOI":"10.1109\/TCSVT.2021.3128214"},{"key":"1622_CR136","unstructured":"Shu, J., Xie, Q., Yi, L., Zhao, Q., Zhou, S., Xu, Z., & Meng, D. (2019). Meta-weight-net: Learning an explicit mapping for sample weighting. In Advances in neural information processing systems vol.\u00a032 (pp. 
1919\u20131930)."},{"key":"1622_CR137","doi-asserted-by":"crossref","unstructured":"Simard, P.\u00a0Y., LeCun, Y.\u00a0A., Denker, J.\u00a0S., & Victorri, B. (1998). Transformation invariance in pattern recognition\u2014tangent distance and tangent propagation. In Neural networks: Tricks of the trade (pp. 239\u2013274). Springer.","DOI":"10.1007\/3-540-49430-8_13"},{"key":"1622_CR138","unstructured":"Sinha, S., Ohashi, H., & Nakamura, K. (2020). Class-wise difficulty-balanced loss for solving class-imbalance. In Proceedings of the Asian conference on computer vision."},{"key":"1622_CR139","unstructured":"Sohn, K. (2016). Improved deep metric learning with multi-class n-pair loss objective. In Advances in neural information processing systems (pp. 1857\u20131865)."},{"key":"1622_CR140","unstructured":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction. MIT Press."},{"key":"1622_CR141","doi-asserted-by":"crossref","unstructured":"Tan, J., Lu, X., Zhang, G., Yin, C., & Li, Q. (2021). Equalization loss v2: A new gradient balance approach for long-tailed object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 1685\u20131694).","DOI":"10.1109\/CVPR46437.2021.00173"},{"key":"1622_CR142","doi-asserted-by":"crossref","unstructured":"Tan, J., Wang, C., Li, B., Li, Q., Ouyang, W., Yin, C., & Yan, J. (2020). Equalization loss for long-tailed object recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 11662\u201311671).","DOI":"10.1109\/CVPR42600.2020.01168"},{"key":"1622_CR143","unstructured":"Tang, K., Huang, J., & Zhang, H. (2020). Long-tailed classification by keeping the good and removing the bad momentum causal effect. 
In Advances in neural information processing systems."},{"key":"1622_CR144","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., Shamma, D. A., Friedland, G., Elizalde, B., Ni, K., Poland, D., et al. (2016). Yfcc100m: The new data in multimedia research. Communications of the ACM, 59, 64\u201373.","journal-title":"Communications of the ACM"},{"key":"1622_CR145","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., & He, T. (2019). Fcos: Fully convolutional one-stage object detection. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 9627\u20139636).","DOI":"10.1109\/ICCV.2019.00972"},{"key":"1622_CR146","unstructured":"van Steenkiste, S., Greff, K., & Schmidhuber, J. (2019). A perspective on objects and systematic generalization in model-based RL. arXiv:1906.01035"},{"key":"1622_CR147","unstructured":"van\u00a0den Oord, A., Vinyals, O., & Kavukcuoglu, K. (2017). Neural discrete representation learning. In Advances in neural information processing systems."},{"key":"1622_CR148","unstructured":"Van\u00a0Horn, G., & Perona, P. (2017). The devil is in the tails: Fine-grained classification in the wild. arXiv:1709.01450"},{"key":"1622_CR149","doi-asserted-by":"crossref","unstructured":"Van\u00a0Horn, G., Mac\u00a0Aodha, O., Song, Y., Cui, Y., Sun, C., Shepard, A., Adam, H., Perona, P., & Belongie, S. (2018). The inaturalist species classification and detection dataset. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 8769\u20138778).","DOI":"10.1109\/CVPR.2018.00914"},{"key":"1622_CR150","doi-asserted-by":"crossref","unstructured":"Wang, Y., Gan, W., Yang, J., Wu, W., & Yan, J. (2019). Dynamic curriculum learning for imbalanced data classification. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 
5017\u20135026).","DOI":"10.1109\/ICCV.2019.00512"},{"key":"1622_CR151","unstructured":"Wang, C., Gao, S., Wang, P., Gao, G., Pei, W., Pan, L., & Xu, Z. (2021). Label-aware distribution calibration for long-tailed classification. arXiv:2111.04901"},{"key":"1622_CR152","doi-asserted-by":"crossref","unstructured":"Wang, P., Han, K., Wei, X.\u00a0S., Zhang, L., & Wang, L. (2021). Contrastive learning based hybrid networks for long-tailed image classification. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 943\u2013952).","DOI":"10.1109\/CVPR46437.2021.00100"},{"key":"1622_CR153","unstructured":"Wang, R., Hu, K., Zhu, Y., Shu, J., Zhao, Q., & Meng, D. (2020). Meta feature modulator for long-tailed recognition. arXiv:2008.03428"},{"key":"1622_CR154","doi-asserted-by":"crossref","unstructured":"Wang, T., Li, Y., Kang, B., Li, J., Liew, J., Tang, S., Hoi, S., & Feng, J. (2020). The devil is in classification: A simple framework for long-tail instance segmentation. In Proceedings of the European conference on computer vision (pp. 728\u2013744).","DOI":"10.1007\/978-3-030-58568-6_43"},{"key":"1622_CR155","unstructured":"Wang, X., Lian, L., Miao, Z., Liu, Z., & Yu, S.X. (2021). Long-tailed recognition by routing diverse distribution-aware experts. In International conference on learning representations."},{"key":"1622_CR156","unstructured":"Wang, T.\u00a0C., Liu, M.\u00a0Y., Zhu, J.\u00a0Y., Liu, G., Tao, A., Kautz, J., & Catanzaro, B. (2018). Video-to-video synthesis. In Advances in neural information processing systems (pp. 1152\u20131164)."},{"key":"1622_CR157","unstructured":"Wang, Y.\u00a0X., Ramanan, D., & Hebert, M. (2017). Learning to model the tail. In Advances in neural information processing systems (pp. 7029\u20137039)."},{"key":"1622_CR158","doi-asserted-by":"crossref","unstructured":"Wang, H., Wang, Y., Zhou, Z., Ji, X., Gong, D., Zhou, J., Li, Z., & Liu, W. (2018). 
Cosface: Large margin cosine loss for deep face recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5265\u20135274).","DOI":"10.1109\/CVPR.2018.00552"},{"key":"1622_CR159","unstructured":"Wang, H., Xiao, C., Kossaifi, J., Yu, Z., Anandkumar, A., & Wang, Z. (2021). Augmax: Adversarial composition of random augmentations for robust training. In Advances in neural information processing systems."},{"key":"1622_CR160","unstructured":"Wang, Y., Yao, Q., Kwok, J., & Ni, L. (2019). Few-shot learning: A survey. arXiv:1904.05046"},{"key":"1622_CR161","unstructured":"Wang, Y., Zhang, B., Hou, W., Wu, Z., Wang, J., & Shinozaki, T. (2021). Margin calibration for long-tailed visual recognition. arXiv:2112.07225"},{"key":"1622_CR162","doi-asserted-by":"crossref","unstructured":"Wang, J., Zhang, W., Zang, Y., Cao, Y., Pang, J., Gong, T., Chen, K., Liu, Z., Loy, C.\u00a0C., & Lin, D. (2021). Seesaw loss for long-tailed instance segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9695\u20139704).","DOI":"10.1109\/CVPR46437.2021.00957"},{"key":"1622_CR163","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhu, Y., Zhao, C., Zeng, W., Wang, J., & Tang, M. (2021). Adaptive class suppression loss for long-tail object detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 3103\u20133112).","DOI":"10.1109\/CVPR46437.2021.00312"},{"key":"1622_CR164","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1016\/j.asoc.2013.09.014","volume":"20","author":"KJ Wang","year":"2014","unstructured":"Wang, K. J., Makond, B., Chen, K. H., & Wang, K. M. (2014). A hybrid classifier combining SMOTE with PSO to estimate 5-year survivability of breast cancer patients. 
Applied Soft Computing, 20, 15\u201324.","journal-title":"Applied Soft Computing"},{"key":"1622_CR165","doi-asserted-by":"crossref","unstructured":"Wei, C., Sohn, K., Mellina, C., Yuille, A., & Yang, F. (2021). Crest: A class-rebalancing self-training framework for imbalanced semi-supervised learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 10857\u201310866).","DOI":"10.1109\/CVPR46437.2021.01071"},{"key":"1622_CR166","doi-asserted-by":"crossref","unstructured":"Weyand, T., Araujo, A., Cao, B., & Sim, J. (2020). Google landmarks dataset v2-a large-scale benchmark for instance-level recognition and retrieval. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2575\u20132584).","DOI":"10.1109\/CVPR42600.2020.00265"},{"key":"1622_CR167","unstructured":"Wightman, R., Touvron, H., & Jegou, H. (2021). Resnet strikes back: An improved training procedure in timm. arXiv:2110.00476"},{"key":"1622_CR168","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1109\/TSMC.1972.4309137","volume":"3","author":"DL Wilson","year":"1972","unstructured":"Wilson, D. L. (1972). Asymptotic properties of nearest neighbor rules using edited data. IEEE Transactions on Systems, Man, and Cybernetics, 3, 408\u2013421.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics"},{"key":"1622_CR169","doi-asserted-by":"crossref","unstructured":"Wu, T., Huang, Q., Liu, Z., Wang, Y., & Lin, D. (2020). Distribution-balanced loss for multi-label classification in long-tailed datasets. In Proceedings of the European conference on computer vision (pp. 162\u2013178).","DOI":"10.1007\/978-3-030-58548-8_10"},{"key":"1622_CR170","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W.\u00a0Y., & Girshick, R. (2019). Detectron2. https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"1622_CR171","doi-asserted-by":"crossref","unstructured":"Wu, T., Liu, Z., Huang, Q., Wang, Y., & Lin, D. (2021). 
Adversarial robustness under long-tailed distribution. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 8659\u20138668).","DOI":"10.1109\/CVPR46437.2021.00855"},{"key":"1622_CR172","doi-asserted-by":"crossref","unstructured":"Wu, J., Song, L., Wang, T., Zhang, Q., & Yuan, J. (2020). Forest R-CNN: Large-vocabulary long-tailed object detection and instance segmentation. In Proceedings of the ACM international conference on multimedia (pp. 1570\u20131578).","DOI":"10.1145\/3394171.3413970"},{"key":"1622_CR173","doi-asserted-by":"crossref","unstructured":"Xiang, L., Ding, G., & Han, J. (2020). Learning from multiple experts: Self-paced knowledge distillation for long-tailed classification. In Proceedings of the European conference on computer vision (pp. 247\u2013263).","DOI":"10.1007\/978-3-030-58558-7_15"},{"key":"1622_CR174","unstructured":"Yang, Y., & Xu, Z. (2020). Rethinking the value of labels for improving class-imbalanced learning. In Advances in neural information processing systems."},{"key":"1622_CR175","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.\u00a0R., & Le, Q.\u00a0V. (2019). Xlnet: Generalized autoregressive pretraining for language understanding. In Advances in neural information processing systems (pp. 5753\u20135763)."},{"key":"1622_CR176","doi-asserted-by":"publisher","first-page":"855","DOI":"10.1007\/s11042-020-09604-z","volume":"80","author":"L Yang","year":"2021","unstructured":"Yang, L., Song, Q., & Wu, Y. (2021). Attacks on state-of-the-art face recognition using attentional adversarial attack generative network. Multimedia Tools and Applications, 80, 855\u2013875.","journal-title":"Multimedia Tools and Applications"},{"key":"1622_CR177","unstructured":"Yaoyao, Z., & Weihong, D. (2019). Adversarial learning with margin-based triplet embedding regularization. 
In Proceedings of the IEEE\/CVF international conference on computer vision"},{"key":"1622_CR178","doi-asserted-by":"crossref","unstructured":"Yitzhaki, S., & Schechtman, E. (2013). More than a dozen alternative ways of spelling Gini. In The Gini Methodology (pp. 11\u201331).","DOI":"10.1007\/978-1-4614-4720-7_2"},{"key":"1622_CR179","doi-asserted-by":"crossref","unstructured":"Yu, W., Yang, T., & Chen, C. (2021). Towards resolving the challenge of long-tail distribution in UAV images for object detection. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision (pp. 3258\u20133267).","DOI":"10.1109\/WACV48630.2021.00330"},{"key":"1622_CR180","doi-asserted-by":"crossref","unstructured":"Zang, Y., Huang, C., & Loy, C.\u00a0C. (2021). Fasa: Feature augmentation and sampling adaptation for long-tailed instance segmentation. arXiv:2102.12867","DOI":"10.1109\/ICCV48922.2021.00344"},{"key":"1622_CR181","doi-asserted-by":"crossref","unstructured":"Zeng, A., Sun, X., Huang, F., Liu, M., Xu, Q., & Lin, S. (2020). Srnet: Improving generalization in 3d human pose estimation with a split-and-recombine approach. In Proceedings of the European conference on computer vision (pp. 507\u2013523).","DOI":"10.1007\/978-3-030-58568-6_30"},{"key":"1622_CR182","unstructured":"Zhang, S., Chen, C., Hu, X., & Peng, S. (2021). Balanced knowledge distillation for long-tailed learning. arXiv:2104.10510"},{"key":"1622_CR183","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Cheng, D.Z., Yao, T., Yi, X., Hong, L., & Chi, E.H. (2021). A model of two tales: Dual transfer learning framework for improved long-tail item recommendation. In Proceedings of the web conference 2021 (pp. 2220\u20132231).","DOI":"10.1145\/3442381.3450086"},{"key":"1622_CR184","unstructured":"Zhang, H., Cisse, M., Dauphin, Y.\u00a0N., & Lopez-Paz, D. (2018). mixup: Beyond empirical risk minimization. 
In International conference on learning representations."},{"key":"1622_CR185","doi-asserted-by":"crossref","unstructured":"Zhang, X., Fang, Z., Wen, Y., Li, Z., & Qiao, Y. (2017). Range loss for deep face recognition with long-tailed training data. In Proceedings of the IEEE international conference on computer vision (pp. 5409\u20135418)","DOI":"10.1109\/ICCV.2017.578"},{"key":"1622_CR186","unstructured":"Zhang, Y., Kang, B., Hooi, B., Yan, S., & Feng, J. (2021). Deep long-tailed learning: A survey. arXiv:2110.04596"},{"key":"1622_CR187","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., Choi, Y., & Gao, J. (2021). Vinvl: Revisiting visual representations in vision-language models. arXiv:2101.00529","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"1622_CR188","doi-asserted-by":"crossref","unstructured":"Zhang, S., Li, Z., Yan, S., He, X., & Sun, J. (2021). Distribution alignment: A unified framework for long-tail visual recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2361\u20132370).","DOI":"10.1109\/CVPR46437.2021.00239"},{"key":"1622_CR189","doi-asserted-by":"crossref","unstructured":"Zhang, G., Lu, X., Tan, J., Li, J., Zhang, Z., Li, Q., & Hu, X. (2021). Refinemask: Towards high-quality instance segmentation with fine-grained features. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 6861\u20136869).","DOI":"10.1109\/CVPR46437.2021.00679"},{"key":"1622_CR190","doi-asserted-by":"crossref","unstructured":"Zhang, C., Pan, T.\u00a0Y., Li, Y., Hu, H., Xuan, D., Changpinyo, S., Gong, B., & Chao, W.\u00a0L. (2021). A simple and effective use of object-centric images for long-tailed object detection. arXiv:2102.08884","DOI":"10.1109\/ICCV48922.2021.00047"},{"key":"1622_CR191","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shi, Y., Yuan, C., Li, B., Wang, P., Hu, W., & Zha, Z.\u00a0J. (2020). 
Object relational graph with teacher-recommended learning for video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 13278\u201313288).","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"1622_CR192","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Wei, X.\u00a0S., Zhou, B., & Wu, J. (2021). Bag of tricks for long-tailed visual recognition with deep convolutional neural networks. In Proceedings of the AAAI conference on artificial intelligence (pp. 3447\u20133455).","DOI":"10.1609\/aaai.v35i4.16458"},{"key":"1622_CR193","unstructured":"Zhao, Y., Chen, W., Tan, X., Huang, K., Xu, J., Wang, C., & Zhu, J. (2021). Improving long-tailed classification from instance level. arXiv:2104.06094"},{"key":"1622_CR194","doi-asserted-by":"crossref","unstructured":"Zhao, J., Li, J., Cheng, Y., Zhou, L., Sim, T., Yan, S., & Feng, J. (2018). Understanding humans in crowded scenes: Deep nested adversarial learning and a new benchmark for multi-human parsing. In Proceedings of the ACM international conference on multimedia (pp. 792\u2013800).","DOI":"10.1145\/3240508.3240509"},{"key":"1622_CR195","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., & Jia, J. (2017). Pyramid scene parsing network. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2881\u20132890).","DOI":"10.1109\/CVPR.2017.660"},{"key":"1622_CR196","doi-asserted-by":"crossref","unstructured":"Zheng, L., Shen, L., Tian, L., Wang, S., Wang, J., & Tian, Q. (2015). Scalable person re-identification: A benchmark. In Proceedings of the IEEE international conference on computer vision (pp. 1116\u20131124).","DOI":"10.1109\/ICCV.2015.133"},{"key":"1622_CR197","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Cui, J., Liu, S., & Jia, J. (2021). Improving calibration for long-tailed recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 
16489\u201316498).","DOI":"10.1109\/CVPR46437.2021.01622"},{"key":"1622_CR198","doi-asserted-by":"crossref","unstructured":"Zhou, B., Cui, Q., Wei, X.\u00a0S., & Chen, Z.\u00a0M. (2020). Bbn: Bilateral-branch network with cumulative learning for long-tailed visual recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9719\u20139728).","DOI":"10.1109\/CVPR42600.2020.00974"},{"key":"1622_CR199","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., & Torralba, A. (2016). Learning deep features for discriminative localization. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2921\u20132929)","DOI":"10.1109\/CVPR.2016.319"},{"key":"1622_CR200","unstructured":"Zhou, X., Koltun, V., & Kr\u00e4henb\u00fchl, P. (2021). Probabilistic two-stage detection. arXiv:2103.07461"},{"key":"1622_CR201","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2017). Scene parsing through ade20k dataset. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 633\u2013641)","DOI":"10.1109\/CVPR.2017.544"},{"key":"1622_CR202","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","volume":"40","author":"B Zhou","year":"2017","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., & Torralba, A. (2017). Places: A 10 million image database for scene recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 40, 1452\u20131464.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1622_CR203","doi-asserted-by":"crossref","unstructured":"Zou, Y., Yu, Z., Kumar, B., & Wang, J. (2018). Unsupervised domain adaptation for semantic segmentation via class-balanced self-training. In Proceedings of the European conference on computer vision (pp. 
289\u2013305).","DOI":"10.1007\/978-3-030-01219-9_18"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01622-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-022-01622-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01622-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,6]],"date-time":"2023-02-06T04:14:01Z","timestamp":1675656841000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-022-01622-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,25]]},"references-count":203,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2022,7]]}},"alternative-id":["1622"],"URL":"https:\/\/doi.org\/10.1007\/s11263-022-01622-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,5,25]]},"assertion":[{"value":"11 November 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 May 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}