{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T08:09:05Z","timestamp":1779350945668,"version":"3.51.4"},"reference-count":80,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100018919","name":"Peng Cheng Laboratory","doi-asserted-by":"publisher","award":["PCL2023A08"],"award-info":[{"award-number":["PCL2023A08"]}],"id":[{"id":"10.13039\/100018919","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s11263-026-02741-2","type":"journal-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T10:10:32Z","timestamp":1772791832000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["AEMIM: Adversarial Examples Meet Masked Image Modeling"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3730-8848","authenticated-orcid":false,"given":"Wenzhao","family":"Xiang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hang","family":"Su","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongyang","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,3,6]]},"reference":[{"key":"2741_CR1","doi-asserted-by":"crossref","unstructured":"Assran, M., Duval, Q., Misra, I., Bojanowski, P., Vincent, P., Rabbat, M., LeCun, Y., & Ballas, N. (2023). Self-supervised learning from images with a joint-embedding predictive architecture. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 15619\u201315629.","DOI":"10.1109\/CVPR52729.2023.01499"},{"key":"2741_CR2","unstructured":"Baevski, A., Hsu, W.-N., Xu, Q., Babu, A., Gu, J., & Auli, M. (2022). Data2vec: A general framework for self-supervised learning in speech, vision and language. In International conference on machine learning, pp. 1298\u20131312. PMLR"},{"key":"2741_CR3","unstructured":"Bao, H., Dong, L., Piao, S., & Wei, F. (2021). Beit: Bert pre-training of image transformers. In International conference on learning representations"},{"key":"2741_CR4","doi-asserted-by":"crossref","unstructured":"Bar\u00a0Tal, O., Haviv, A., & Bermano, A.H. (2023). Omg-attack: Self-supervised on-manifold generation of transferable evasion attacks. In Proceedings of the IEEE\/CVF international conference on computer vision (ICCV) workshops, pp. 3696\u20133706.","DOI":"10.1109\/ICCVW60793.2023.00397"},{"key":"2741_CR5","unstructured":"Beyer, L., H\u00e9naff, O.J., Kolesnikov, A., Zhai, X., & Oord, A.v.d. (2020). Are we done with imagenet? arXiv preprint arXiv:2006.07159."},{"key":"2741_CR6","doi-asserted-by":"crossref","unstructured":"Brempong, E.A., Kornblith, S., Chen, T., Parmar, N., Minderer, M., & Norouzi, M. (2022). Denoising pretraining for semantic segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (cvpr) workshops, pp. 4175\u20134186.","DOI":"10.1109\/CVPRW56347.2022.00462"},{"key":"2741_CR7","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in Neural Information Processing Systems, 33, 1877\u20131901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2741_CR8","doi-asserted-by":"crossref","unstructured":"Carlini, N., & Wagner, D. (2017). Towards evaluating the robustness of neural networks. In 2017 Ieee symposium on security and privacy (sp), pp. 39\u201357. Ieee","DOI":"10.1109\/SP.2017.49"},{"key":"2741_CR9","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., & Joulin, A. (2021). Emerging properties in self-supervised vision transformers. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 9650\u20139660.","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"2741_CR10","unstructured":"Chen, X., Fan, H., & Girshick, R., He, K. (2020). Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297"},{"key":"2741_CR11","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020). A simple framework for contrastive learning of visual representations. In International conference on machine learning, pp. 1597\u20131607. PMLR"},{"key":"2741_CR12","unstructured":"Chen, K., Wang, J., Pang, J., Cao, Y., Xiong, Y., Li, X., Sun, S., Feng, W., Liu, Z., Xu, J., et al. (2019). Mmdetection: Open mmlab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155."},{"key":"2741_CR13","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., & He, K. (2021). An empirical study of training self-supervised vision transformers. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 9640\u20139649","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"2741_CR14","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, C., Tan, M., Zhang, L., Hsieh, C.-J., & Gong, B. (2021). Robust and accurate object detection via adversarial learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16622\u201316631.","DOI":"10.1109\/CVPR46437.2021.01635"},{"key":"2741_CR15","first-page":"22243","volume":"33","author":"T Chen","year":"2020","unstructured":"Chen, T., Kornblith, S., Swersky, K., Norouzi, M., & Hinton, G. E. (2020). Big self-supervised models are strong semi-supervised learners. Advances in Neural Information Processing Systems, 33, 22243\u201322255.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2741_CR16","unstructured":"Contributors, M. (2020). MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark. https:\/\/github.com\/open-mmlab\/mmsegmentation."},{"key":"2741_CR17","doi-asserted-by":"crossref","unstructured":"Cubuk, E.D., Zoph, B., Shlens, J., & Le, Q.V. (2020). Randaugment: Practical automated data augmentation with a reduced search space. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops, pp. 702\u2013703.","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"2741_CR18","doi-asserted-by":"crossref","unstructured":"Dalal, N., & Triggs, B. (2005). Histograms of oriented gradients for human detection. In 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR\u201905), vol. 1, pp. 886\u2013893. Ieee","DOI":"10.1109\/CVPR.2005.177"},{"key":"2741_CR19","doi-asserted-by":"crossref","unstructured":"Dong, Y., Liao, F., Pang, T., Su, H., Zhu, J., Hu, X., & Li, J. (2018). Boosting adversarial attacks with momentum. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 9185\u20139193.","DOI":"10.1109\/CVPR.2018.00957"},{"key":"2741_CR20","doi-asserted-by":"crossref","unstructured":"Dong, Y., Pang, T., Su, H., & Zhu, J. (2019). Evading defenses to transferable adversarial examples by translation-invariant attacks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 4312\u20134321.","DOI":"10.1109\/CVPR.2019.00444"},{"key":"2741_CR21","doi-asserted-by":"publisher","first-page":"552","DOI":"10.1609\/aaai.v37i1.25130","volume":"37","author":"X Dong","year":"2023","unstructured":"Dong, X., Bao, J., Zhang, T., Chen, D., Zhang, W., Yuan, L., Chen, D., Wen, F., Yu, N., & Guo, B. (2023). Peco: Perceptual codebook for bert pre-training of vision transformers. Proceedings of the AAAI Conference on Artificial Intelligence, 37, 552\u2013560.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2741_CR22","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In International conference on learning representations."},{"key":"2741_CR23","unstructured":"Fang, Y., Dong, L., Bao, H., Wang, X., & Wei, F. (2023). Corrupted image modeling for self-supervised visual pre-training. In The eleventh international conference on learning representations."},{"key":"2741_CR24","doi-asserted-by":"crossref","unstructured":"Gao, P., Lin, Z., Zhang, R., Fang, R., Li, H., Li, H., & Qiao, Y. (2023). Mimic before reconstruct: Enhancing masked autoencoders with feature mimicking. International Journal of Computer Vision, 1\u201311.","DOI":"10.1007\/s11263-023-01898-4"},{"key":"2741_CR25","unstructured":"Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks. In Proceedings of the thirteenth international conference on artificial intelligence and statistics, pp. 249\u2013256. JMLR Workshop and Conference Proceedings."},{"key":"2741_CR26","unstructured":"Goodfellow, I.J., Shlens, J., & Szegedy, C. (2014). Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572."},{"key":"2741_CR27","unstructured":"Haghighat, M., Moghadam, P., Mohamed, S., & Koniusz, P. (2023). Pre-training with random orthogonal projection image modeling. arXiv preprint arXiv:2310.18737"},{"key":"2741_CR28","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R. (2022). Masked autoencoders are scalable vision learners. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16000\u201316009","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2741_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., & Girshick, R. (2020). Momentum contrast for unsupervised visual representation learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9729\u20139738.","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2741_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In Proceedings of the IEEE international conference on computer vision, pp. 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2741_CR31","unstructured":"Hendrycks, D., & Dietterich, T. (2019). Benchmarking neural network robustness to common corruptions and perturbations. In International conference on learning representations"},{"key":"2741_CR32","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Basart, S., Mu, N., Kadavath, S., Wang, F., Dorundo, E., Desai, R., Zhu, T., Parajuli, S., Guo, M., et al. (2021). The many faces of robustness: A critical analysis of out-of-distribution generalization. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 8340\u20138349.","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"2741_CR33","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., & Song, D. (2021). Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15262\u201315271.","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"2741_CR34","unstructured":"Hou, Z., Sun, F., Chen, Y.-K., Xie, Y., & Kung, S.-Y. (2022). Milan: Masked image pretraining on language assisted representation. arXiv preprint arXiv:2208.06049."},{"key":"2741_CR35","doi-asserted-by":"crossref","unstructured":"Huang, G., Sun, Y., Liu, Z., Sedra, D., & Weinberger, K.Q. (2016). Deep networks with stochastic depth. In Computer Vision\u2013ECCV 2016: 14th european conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, pp. 646\u2013661. Springer","DOI":"10.1007\/978-3-319-46493-0_39"},{"issue":"1","key":"2741_CR36","doi-asserted-by":"publisher","first-page":"160","DOI":"10.1007\/s11263-022-01701-w","volume":"131","author":"E Kazemi","year":"2023","unstructured":"Kazemi, E., Kerdreux, T., & Wang, L. (2023). Minimally distorted structured adversarial attacks. International Journal of Computer Vision, 131(1), 160\u2013176.","journal-title":"International Journal of Computer Vision"},{"key":"2741_CR37","unstructured":"Kenton, J.D.M.-W.C., & Toutanova, L.K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of NAACL-HLT, pp. 4171\u20134186."},{"key":"2741_CR38","first-page":"2983","volume":"33","author":"M Kim","year":"2020","unstructured":"Kim, M., Tack, J., & Hwang, S. J. (2020). Adversarial self-supervised contrastive learning. Advances in Neural Information Processing Systems, 33, 2983\u20132994.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2741_CR39","doi-asserted-by":"crossref","unstructured":"Li, W., Xie, J., & Loy, C.C. (2023). Correlational image modeling for self-supervised visual pre-training. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 15105\u201315115.","DOI":"10.1109\/CVPR52729.2023.01450"},{"key":"2741_CR40","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In Computer Vision\u2013ECCV 2014: 13th european conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2741_CR41","unstructured":"Lin, J., Song, C., He, K., Wang, L., & Hopcroft, J.E. (2020). Nesterov accelerated gradient and scale invariance for adversarial attacks. In International conference on learning representations."},{"key":"2741_CR42","unstructured":"Liu, C., Dong, Y., Xiang, W., Yang, X., Su, H., Zhu, J., Chen, Y., He, Y., Xue, H., & Zheng, S. (2023). A comprehensive study on robustness of image classification models: Benchmarking and rethinking. arXiv preprint arXiv:2302.14301."},{"key":"2741_CR43","doi-asserted-by":"crossref","unstructured":"Liu, J., Huang, X., Zheng, J., Liu, Y., & Li, H. (2023). Mixmae: Mixed and masked autoencoder for efficient pretraining of hierarchical vision transformers. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6252\u20136261.","DOI":"10.1109\/CVPR52729.2023.00605"},{"key":"2741_CR44","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2741_CR45","doi-asserted-by":"crossref","unstructured":"Liu, C., Xiang, W., He, Y., Xue, H., Zheng, S., & Su, H. (2023). Improving model generalization by on-manifold adversarial augmentation in the frequency domain. arXiv preprint arXiv:2302.14302","DOI":"10.2139\/ssrn.4768680"},{"key":"2741_CR46","unstructured":"Loshchilov, I., & Hutter, F. (2017). SGDR: Stochastic gradient descent with warm restarts. In International conference on learning representations."},{"key":"2741_CR47","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In International conference on learning representations."},{"issue":"11","key":"2741_CR48","first-page":"2579","volume":"9","author":"L Maaten","year":"2008","unstructured":"Maaten, L., & Hinton, G. (2008). Visualizing data using t-sne. Journal of Machine Learning Research, 9(11), 2579\u20132605.","journal-title":"Journal of Machine Learning Research"},{"key":"2741_CR49","unstructured":"Madry, A., Makelov, A., Schmidt, L., Tsipras, D., & Vladu, A. (2018). Towards deep learning models resistant to adversarial attacks. In International conference on learning representations."},{"key":"2741_CR50","unstructured":"Mei, J., Han, Y., Bai, Y., Zhang, Y., Li, Y., Li, X., Yuille, A., & Xie, C. (2022). Fast advprop. In International conference on learning representations."},{"key":"2741_CR51","unstructured":"Oord, A.v.d., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748"},{"key":"2741_CR52","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A., et al. (2023). Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193."},{"key":"2741_CR53","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pp. 8748\u20138763. PmLR"},{"key":"2741_CR54","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al. (2018). Improving language understanding by generative pre-training"},{"issue":"8","key":"2741_CR55","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al. (2019). Language models are unsupervised multitask learners. OpenAI blog, 1(8), 9.","journal-title":"OpenAI blog"},{"key":"2741_CR56","unstructured":"Rebuffi, S.-A., Croce, F., & Gowal, S. (2023). Revisiting adapters with adversarial training. In The eleventh international conference on learning representations."},{"key":"2741_CR57","unstructured":"Recht, B., Roelofs, R., Schmidt, L., & Shankar, V. (2019). Do imagenet classifiers generalize to imagenet? In International conference on machine learning, pp. 5389\u20135400. PMLR"},{"key":"2741_CR58","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., et al. (2015). Imagenet large scale visual recognition challenge. International Journal of Computer Vision, 115, 211\u2013252.","journal-title":"International Journal of Computer Vision"},{"key":"2741_CR59","unstructured":"Shi, Y., Siddharth, N., Torr, P., & Kosiorek, A.R. (2022). Adversarial masking for self-supervised learning. In International conference on machine learning, pp. 20026\u201320040. PMLR"},{"key":"2741_CR60","doi-asserted-by":"crossref","unstructured":"Stutz, D., Hein, M., & Schiele, B. (2019). Disentangling adversarial robustness and generalization. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2019.00714"},{"key":"2741_CR61","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., & Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 2818\u20132826.","DOI":"10.1109\/CVPR.2016.308"},{"key":"2741_CR62","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et al. (2017). Neural discrete representation learning. Advances in neural information processing systems 30."},{"key":"2741_CR63","unstructured":"Wang, H., Ge, S., Lipton, Z., & Xing, E.P. (2019). Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems 32"},{"key":"2741_CR64","doi-asserted-by":"crossref","unstructured":"Wang, H., Song, K., Fan, J., Wang, Y., Xie, J., & Zhang, Z. (2023). Hard patches mining for masked image modeling. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10375\u201310385.","DOI":"10.1109\/CVPR52729.2023.01000"},{"key":"2741_CR65","doi-asserted-by":"crossref","unstructured":"Wei, C., Fan, H., Xie, S., Wu, C.-Y., Yuille, A., & Feichtenhofer, C. (2022). Masked feature prediction for self-supervised visual pre-training. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 14668\u201314678.","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"2741_CR66","doi-asserted-by":"crossref","unstructured":"Wei, L., Xie, L., Zhou, W., Li, H., & Tian, Q. (2022). Mvp: Multimodality-guided visual pre-training. In European conference on computer vision, pp. 337\u2013353. Springer","DOI":"10.1007\/978-3-031-20056-4_20"},{"key":"2741_CR67","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103647","volume":"229","author":"W Xiang","year":"2023","unstructured":"Xiang, W., Su, H., Liu, C., Guo, Y., & Zheng, S. (2023). Improving the robustness of adversarial attacks using an affine-invariant gradient estimator. Computer Vision and Image Understanding, 229, Article 103647.","journal-title":"Computer Vision and Image Understanding"},{"key":"2741_CR68","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., & Sun, J. (2018). Unified perceptual parsing for scene understanding. In Proceedings of the european conference on computer vision (ECCV), pp. 418\u2013434.","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"2741_CR69","unstructured":"Xiao, J., Yang, L., Fan, Y., Wang, J., & Luo, Z.-Q. (2022). Understanding adversarial robustness against on-manifold adversarial examples. arXiv preprint arXiv:2210.00430."},{"key":"2741_CR70","unstructured":"Xie, J., Li, W., Zhan, X., Liu, Z., Ong, Y.S., & Loy, C.C. (2022). Masked frequency modeling for self-supervised visual pre-training. arXiv preprint arXiv:2206.07706."},{"key":"2741_CR71","doi-asserted-by":"crossref","unstructured":"Xie, C., Tan, M., Gong, B., Wang, J., Yuille, A.L., & Le, Q.V. (2020). Adversarial examples improve image recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 819\u2013828.","DOI":"10.1109\/CVPR42600.2020.00090"},{"key":"2741_CR72","doi-asserted-by":"crossref","unstructured":"Xie, Z., Zhang, Z., Cao, Y., Lin, Y., Bao, J., Yao, Z., Dai, Q., & Hu, H. (2022). Simmim: A simple framework for masked image modeling. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9653\u20139663.","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"2741_CR73","doi-asserted-by":"crossref","unstructured":"Xie, C., Zhang, Z., Zhou, Y., Bai, S., Wang, J., Ren, Z., & Yuille, A.L. (2019). Improving transferability of adversarial examples with input diversity. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2730\u20132739.","DOI":"10.1109\/CVPR.2019.00284"},{"key":"2741_CR74","unstructured":"You, Y., Gitman, I., & Ginsburg, B. (2017). Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888."},{"key":"2741_CR75","unstructured":"You, Z., Liu, D., & Xu, C. (2023). Beyond pretrained features: Noisy image modeling provides adversarial defense. arXiv preprint arXiv:2302.01056."},{"key":"2741_CR76","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., & Yoo, Y. (2019). Cutmix: Regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6023\u20136032.","DOI":"10.1109\/ICCV.2019.00612"},{"key":"2741_CR77","unstructured":"Zhang, H., Cisse, M., Dauphin, Y.N., & Lopez-Paz, D. (2018). mixup: Beyond empirical risk minimization. In International conference on learning representations."},{"key":"2741_CR78","unstructured":"Zhang, H., Yu, Y., Jiao, J., Xing, E., El\u00a0Ghaoui, L., & Jordan, M. (2019). Theoretically principled trade-off between robustness and accuracy. In International conference on machine learning, pp. 7472\u20137482. PMLR"},{"key":"2741_CR79","unstructured":"Zhou, J., Wei, C., Wang, H., Shen, W., Xie, C., Yuille, A., & Kong, T. (2022). Image BERT pre-training with online tokenizer. In International conference on learning representations."},{"key":"2741_CR80","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2017). Scene parsing through ade20k dataset. In Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.544"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02741-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02741-2","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02741-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T07:33:55Z","timestamp":1779348835000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02741-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":80,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["2741"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02741-2","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,6]]},"assertion":[{"value":"16 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no competing financial interests or personal relationships that could influence the work reported in this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"159"}}