{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T21:31:26Z","timestamp":1773869486117,"version":"3.50.1"},"reference-count":75,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T00:00:00Z","timestamp":1743811200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T00:00:00Z","timestamp":1743811200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s11263-025-02412-8","type":"journal-article","created":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T23:32:51Z","timestamp":1743895971000},"page":"5083-5098","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Diffusion-Enhanced Test-Time Adaptation with Text and Image Augmentation"],"prefix":"10.1007","volume":"133","author":[{"given":"Chun-Mei","family":"Feng","sequence":"first","affiliation":[]},{"given":"Yuanyang","family":"He","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Salman","family":"Khan","sequence":"additional","affiliation":[]},{"given":"Huan","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Zhen","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wangmeng","family":"Zuo","sequence":"additional","affiliation":[]},{"given":"Rick Siow Mong","family":"Goh","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,5]]},"reference":[{"key":"2412_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2412_CR2","unstructured":"Antoniou, A., Storkey, A., & Edwards, H. (2017). Data augmentation generative adversarial networks. arXiv preprint arXiv:1711.04340"},{"key":"2412_CR3","unstructured":"Bansal, H., & Grover, A. (2023). Leaving reality to imagination: Robust classification via generated datasets. arXiv preprint arXiv:2302.02503"},{"key":"2412_CR4","unstructured":"BELLEGroup, (2023). Belle: Be everyone\u2019s large language model engine. https:\/\/github.com\/LianjiaTech\/BELLE."},{"key":"2412_CR5","doi-asserted-by":"crossref","unstructured":"Bolya, D., & Hoffman, J. (2023). Token merging for fast stable diffusion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4598\u20134602.","DOI":"10.1109\/CVPRW59228.2023.00484"},{"key":"2412_CR6","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., & Van\u00a0Gool, L. (2014). Food-101\u2013mining discriminative components with random forests. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part VI 13, Springer, pp 446\u2013461.","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"2412_CR7","unstructured":"Brock, A., Donahue, J., & Simonyan, K. (2018). Large scale gan training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096."},{"key":"2412_CR8","doi-asserted-by":"crossref","unstructured":"Chen, D., Wang, D., Darrell, T., & Ebrahimi, S. (2022). Contrastive test-time adaptation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 295\u2013305.","DOI":"10.1109\/CVPR52688.2022.00039"},{"key":"2412_CR9","unstructured":"Chen, T., Kornblith, S., Norouzi, M., & Hinton, G. (2020). A simple framework for contrastive learning of visual representations. In International conference on machine learning, PMLR, pp 1597\u20131607."},{"key":"2412_CR10","unstructured":"Chiang, W.L., Li, Z., Lin, Z., Sheng, Y., Wu, Z., Zhang, H., Zheng, L., Zhuang, S., Zhuang, Y., Gonzalez, J.E., et\u00a0al. (2023). Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna lmsys org (accessed 14 April 2023) 2(3):6."},{"key":"2412_CR11","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., & Vedaldi, A. (2014). Describing textures in the wild. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3606\u20133613.","DOI":"10.1109\/CVPR.2014.461"},{"key":"2412_CR12","unstructured":"Dai, H., Liu, Z., Liao, W., Huang, X., Cao, Y., Wu, Z., Zhao, L., Xu, S., Liu, W., Liu, N., et\u00a0al. (2023). Auggpt: Leveraging chatgpt for text data augmentation. arXiv preprint arXiv:2302.13007."},{"key":"2412_CR13","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In 2009 IEEE conference on computer vision and pattern recognition, IEEE, 248\u2013255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2412_CR14","unstructured":"Devlin, J., Chang, M.W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"2412_CR15","doi-asserted-by":"crossref","unstructured":"Fei-Fei, L., Fergus, R., & Perona, P. (2004). Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In 2004 conference on computer vision and pattern recognition workshop, IEEE, pp 178\u2013178.","DOI":"10.1109\/CVPR.2004.383"},{"key":"2412_CR16","doi-asserted-by":"crossref","unstructured":"Feng, C.M., Li, B., Xu, X., Liu, Y., Fu, H., & Zuo, W. (2023a). Learning federated visual prompt in null space for mri reconstruction. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8064\u20138073.","DOI":"10.1109\/CVPR52729.2023.00779"},{"key":"2412_CR17","doi-asserted-by":"crossref","unstructured":"Feng, C.M., Yu, K., Liu, Y., Khan, S., & Zuo, W. (2023b). Diverse data augmentation with diffusions for effective test-time prompt tuning. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 2704\u20132714.","DOI":"10.1109\/ICCV51070.2023.00255"},{"key":"2412_CR18","unstructured":"Gao, P., Geng, S., Zhang, R., Ma, T., Fang, R., Zhang, Y., Li, H., & Qiao, Y. (2021). Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544."},{"issue":"2","key":"2412_CR19","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","volume":"132","author":"P Gao","year":"2024","unstructured":"Gao, P., Geng, S., Zhang, R., Ma, T., Fang, R., Zhang, Y., Li, H., & Qiao, Y. (2024). Clip-adapter: Better vision-language models with feature adapters. International Journal of Computer Vision, 132(2), 581\u2013595.","journal-title":"International Journal of Computer Vision"},{"key":"2412_CR20","unstructured":"Gao, Y., Shi, X., Zhu, Y., Wang, H., Tang, Z., Zhou, X., Li, M., & Metaxas, D.N. (2022). Visual prompt tuning for test-time domain adaptation. arXiv preprint arXiv:2210.04831."},{"issue":"11","key":"2412_CR21","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2020). Generative adversarial networks. Communications of the ACM, 63(11), 139\u2013144.","journal-title":"Communications of the ACM"},{"issue":"7","key":"2412_CR22","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., & Borth, D. (2019). Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 12(7), 2217\u20132226.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2412_CR23","unstructured":"Hendrycks, D., Mu, N., Cubuk, E.D., Zoph, B., Gilmer, J., & Lakshminarayanan, B. (2019). Augmix: A simple data processing method to improve robustness and uncertainty. arXiv preprint arXiv:1912.02781."},{"key":"2412_CR24","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Basart, S., Mu, N., Kadavath, S., Wang, F., Dorundo, E., Desai, R., Zhu, T., Parajuli, S., & Guo, M., et\u00a0al. (2021a). The many faces of robustness: A critical analysis of out-of-distribution generalization. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 8340\u20138349.","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"2412_CR25","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., & Song, D. (2021b). Natural adversarial examples. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 15262\u201315271.","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"2412_CR26","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, 33, 6840\u20136851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2412_CR27","unstructured":"Ho, J., Chan, W., Saharia, C., Whang, J., Gao, R., Gritsenko, A., Kingma, D.P., Poole, B., Norouzi, M., Fleet, D.J., et\u00a0al. (2022). Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303."},{"key":"2412_CR28","unstructured":"Huang, T., Chu, J., & Wei, F. (2022). Unsupervised prompt learning for vision-language models. arXiv preprint arXiv:2204.03649."},{"key":"2412_CR29","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning, PMLR, pp 4904\u20134916."},{"key":"2412_CR30","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B.C., Cardie, C., Belongie, S., Hariharan, B., & Lim, S.N. (2022). Visual prompt tuning. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXIII, Springer, pp 709\u2013727.","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"2412_CR31","doi-asserted-by":"crossref","unstructured":"Karmanov, A., Guan, D., Lu, S., El\u00a0Saddik, A., & Xing, E. (2024). Efficient test-time adaptation of vision-language models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14162\u201314171.","DOI":"10.1109\/CVPR52733.2024.01343"},{"key":"2412_CR32","unstructured":"Kingma, D.P., & Welling, M. (2013) Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114"},{"key":"2412_CR33","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., & Fei-Fei, L. (2013). 3d object representations for fine-grained categorization. In Proceedings of the IEEE international conference on computer vision workshops, pp 554\u2013561.","DOI":"10.1109\/ICCVW.2013.77"},{"key":"2412_CR34","unstructured":"Le\u00a0Scao, T., Fan, A., Akiki, C., Pavlick, E., Ili\u0107, S., Hesslow, D., Castagn\u00e9, R., Luccioni, A.S., Yvon, F., Gall\u00e9, M., et\u00a0al. (2023). Bloom: A 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100."},{"key":"2412_CR35","unstructured":"Li, H., Feng, C.M., Zhou, T., Xu, Y., & Chang, X. (2022a). Prompt-driven efficient open-set semi-supervised learning. arXiv preprint arXiv:2209.14205"},{"key":"2412_CR36","doi-asserted-by":"crossref","unstructured":"Li, L.H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J.N., et\u00a0al. (2022b). Grounded language-image pre-training. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 10965\u201310975.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"2412_CR37","first-page":"21808","volume":"34","author":"Y Liu","year":"2021","unstructured":"Liu, Y., Kothari, P., Van Delft, B., Bellot-Gurlet, B., Mordan, T., & Alahi, A. (2021). Ttt++: When does self-supervised test-time training fail or thrive? Advances in Neural Information Processing Systems, 34, 21808\u201321820.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2412_CR38","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., & Vedaldi, A. (2013). Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151."},{"key":"2412_CR39","doi-asserted-by":"crossref","unstructured":"Mandal, D., Narayan, S., Dwivedi, S.K., Gupta, V., Ahmed, S., Khan, F.S., & Shao, L. (2019). Out-of-distribution detection for generalized zero-shot action recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9985\u20139993.","DOI":"10.1109\/CVPR.2019.01022"},{"key":"2412_CR40","doi-asserted-by":"crossref","unstructured":"Meng, C., Rombach, R., Gao, R., Kingma, D., Ermon, S., Ho, J., & Salimans, T. (2023). On distillation of guided diffusion models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14297\u201314306.","DOI":"10.1109\/CVPR52729.2023.01374"},{"key":"2412_CR41","unstructured":"Nichol, A., Dhariwal, P., Ramesh, A., Shyam, P., Mishkin, P., McGrew, B., Sutskever, I., & Chen, M. (2021). Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741."},{"key":"2412_CR42","unstructured":"Nichol, A.Q., & Dhariwal, P. (2021). Improved denoising diffusion probabilistic models. In International Conference on Machine Learning, PMLR, pp 8162\u20138171."},{"key":"2412_CR43","doi-asserted-by":"crossref","unstructured":"Nilsback, M. E., & Zisserman, A. (2008). Automated flower classification over a large number of classes. 2008 Sixth Indian Conference on Computer Vision (pp. 722\u2013729). Graphics & Image Processing: IEEE.","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"2412_CR44","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., & Jawahar, C. (2012). Cats and dogs. In 2012 IEEE conference on computer vision and pattern recognition, IEEE, pp 3498\u20133505.","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"2412_CR45","unstructured":"Perez, L., & Wang, J. (2017). The effectiveness of data augmentation in image classification using deep learning. arXiv preprint arXiv:1712.04621."},{"key":"2412_CR46","first-page":"15606","volume":"2023","author":"F Piedboeuf","year":"2023","unstructured":"Piedboeuf, F., & Langlais, P. (2023). Is chatgpt the ultimate data augmentation algorithm? Findings of the Association for Computational Linguistics: EMNLP, 2023, 15606\u201315615.","journal-title":"Findings of the Association for Computational Linguistics: EMNLP"},{"key":"2412_CR47","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al. (2018). Improving language understanding by generative pre-training."},{"key":"2412_CR48","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, PMLR, pp 8748\u20138763."},{"key":"2412_CR49","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125."},{"key":"2412_CR50","unstructured":"Recht, B., Roelofs, R., Schmidt, L., & Shankar, V. (2019). Do imagenet classifiers generalize to imagenet? In International conference on machine learning, PMLR, pp 5389\u20135400."},{"key":"2412_CR51","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2412_CR52","doi-asserted-by":"crossref","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E., Ghasemipour, S.K.S., Ayan, B.K., Mahdavi, S.S., Lopes, R.G., et\u00a0al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487.","DOI":"10.1145\/3528233.3530757"},{"key":"2412_CR53","first-page":"11539","volume":"33","author":"S Schneider","year":"2020","unstructured":"Schneider, S., Rusak, E., Eck, L., Bringmann, O., Brendel, W., & Bethge, M. (2020). Improving robustness against common corruptions by covariate shift adaptation. Advances in Neural Information Processing Systems, 33, 11539\u201311551.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2412_CR54","doi-asserted-by":"crossref","unstructured":"Shanmugam, D., Blalock, D., Balakrishnan, G., & Guttag, J. (2021). Better aggregation in test-time augmentation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 1214\u20131223.","DOI":"10.1109\/ICCV48922.2021.00125"},{"issue":"1","key":"2412_CR55","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-019-0197-0","volume":"6","author":"C Shorten","year":"2019","unstructured":"Shorten, C., & Khoshgoftaar, T. M. (2019). A survey on image data augmentation for deep learning. Journal of big data, 6(1), 1\u201348.","journal-title":"Journal of big data"},{"key":"2412_CR56","unstructured":"Shu, M., Nie, W., Huang, D.A., Yu, Z., Goldstein, T., Anandkumar, A., & Xiao, C. (2022). Test-time prompt tuning for zero-shot generalization in vision-language models. arXiv preprint arXiv:2209.07511."},{"key":"2412_CR57","first-page":"12533","volume":"34","author":"A Sinha","year":"2021","unstructured":"Sinha, A., Song, J., Meng, C., & Ermon, S. (2021). D2c: Diffusion-decoding models for few-shot conditional generation. Advances in Neural Information Processing Systems, 34, 12533\u201312548.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2412_CR58","unstructured":"Song, Y., Dhariwal, P., Chen, M., & Sutskever, I. (2023). Consistency models. arXiv preprint arXiv:2303.01469."},{"key":"2412_CR59","unstructured":"Soomro, K., Zamir, A.R., & Shah, M. (2012). Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402."},{"key":"2412_CR60","doi-asserted-by":"crossref","unstructured":"Sun, T., Zhang, X., He, Z., Li, P., Cheng, Q., Liu, X., Yan, H., Shao, Y., Tang, Q., Zhang, S., Zhao, X., Chen, K., Zheng, Y., Zhou, Z., Li, R., Zhan, J., Zhou, Y., Li, L., Yang, X., Wu, L., Yin, Z., Huang, X., Jiang, Y.G., & Qiu, X. (2024). Moss: An open conversational large language model. Machine Intelligence Research https:\/\/github.com\/OpenMOSS\/MOSS.","DOI":"10.1007\/s11633-024-1527-z"},{"key":"2412_CR61","unstructured":"Sun, Y., Wang, X., Liu, Z., Miller, J., Efros, A., & Hardt, M. (2020). Test-time training with self-supervision for generalization under distribution shifts. In International conference on machine learning, PMLR, pp 9229\u20139248."},{"key":"2412_CR62","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971."},{"key":"2412_CR63","unstructured":"Ubani, S., Polat, S.O., & Nielsen, R. (2023). Zeroshotdataaug: Generating and augmenting training data with chatgpt. arXiv preprint arXiv:2304.14334."},{"key":"2412_CR64","unstructured":"Wang, D., Shelhamer, E., Liu, S., Olshausen, B., & Darrell, T. (2020). Tent: Fully test-time adaptation by entropy minimization. arXiv preprint arXiv:2006.10726."},{"key":"2412_CR65","unstructured":"Wang, H., Ge, S., Lipton, Z., & Xing, E.P. (2019). Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems 32."},{"key":"2412_CR66","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., & Torralba, A. (2010). Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition, IEEE, pp 3485\u20133492","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"2412_CR67","unstructured":"Zhang, L., Deng, Z., Kawaguchi, K., Ghorbani, A., & Zou, J. (2020). How does mixup help with robustness and generalization? arXiv preprint arXiv:2010.04819."},{"key":"2412_CR68","unstructured":"Zhang, M., Levine, S., & Finn, C. (2021a). Memo: Test time robustness via adaptation and augmentation. arXiv preprint arXiv:2110.09506."},{"key":"2412_CR69","unstructured":"Zhang, M., Cai, Z., Pan, L., Hong, F., Guo, X., Yang, L., & Liu, Z. (2022a). Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001."},{"key":"2412_CR70","unstructured":"Zhang, R., Fang, R., Zhang, W., Gao, P., Li, K., Dai, J., Qiao, Y., & Li, H. (2021b). Tip-adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930."},{"key":"2412_CR71","unstructured":"Zhang, T., Wang, X., Zhou, D., Schuurmans, D., & Gonzalez, J.E. (2022b). Tempera: Test-time prompting via reinforcement learning. arXiv preprint arXiv:2211.11890."},{"key":"2412_CR72","first-page":"7559","volume":"33","author":"S Zhao","year":"2020","unstructured":"Zhao, S., Liu, Z., Lin, J., Zhu, J. Y., & Han, S. (2020). Differentiable augmentation for data-efficient gan training. Advances in Neural Information Processing Systems, 33, 7559\u20137570.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2412_CR73","doi-asserted-by":"publisher","first-page":"13001","DOI":"10.1609\/aaai.v34i07.7000","volume":"34","author":"Z Zhong","year":"2020","unstructured":"Zhong, Z., Zheng, L., Kang, G., Li, S., & Yang, Y. (2020). Random erasing data augmentation. Proceedings of the AAAI conference on artificial intelligence, 34, 13001\u201313008.","journal-title":"Proceedings of the AAAI conference on artificial intelligence"},{"key":"2412_CR74","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., & Liu, Z. (2022a). Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 16816\u201316825.","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"2412_CR75","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02412-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02412-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02412-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T11:29:35Z","timestamp":1757158175000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02412-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,5]]},"references-count":75,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2412"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02412-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,5]]},"assertion":[{"value":"1 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 September 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}