{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T15:47:19Z","timestamp":1778860039376,"version":"3.51.4"},"reference-count":111,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tpami.2025.3580234","type":"journal-article","created":{"date-parts":[[2025,6,16]],"date-time":"2025-06-16T14:58:19Z","timestamp":1750085899000},"page":"8280-8296","source":"Crossref","is-referenced-by-count":6,"title":["Toward a Holistic Evaluation of Robustness in CLIP Models"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7364-298X","authenticated-orcid":false,"given":"Weijie","family":"Tu","sequence":"first","affiliation":[{"name":"School of Computing, Australian National University, Canberra, ACT, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1565-742X","authenticated-orcid":false,"given":"Weijian","family":"Deng","sequence":"additional","affiliation":[{"name":"School of Computing, Australian National University, Canberra, ACT, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8356-4909","authenticated-orcid":false,"given":"Tom","family":"Gedeon","sequence":"additional","affiliation":[{"name":"Australian National University, University of &#x00D3;buda, Budapest, Hungary"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref2","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref3","first-page":"5389","article-title":"Do ImageNet classifiers generalize to ImageNet?","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Recht"},{"key":"ref4","first-page":"10506","article-title":"Learning robust global representations by penalizing local predictive power","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"ref6","first-page":"9453","article-title":"ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Barbu"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"ref8","first-page":"21455","article-title":"Quality not quantity: On the interaction between dataset design and robustness of clip","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Nguyen"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"ref11","first-page":"54111","article-title":"On evaluating adversarial robustness of large vision-language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhao"},{"key":"ref12","first-page":"6216","article-title":"Data determines distributional robustness in contrastive language image pre-training (CLIP)","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Fang"},{"key":"ref13","first-page":"73543","article-title":"Effective robustness against natural distribution shifts for models with different training data","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Shi"},{"key":"ref14","article-title":"ImageNet-X: Understanding model mistakes with factor of variation annotations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Idrissi"},{"key":"ref15","first-page":"35087","article-title":"Delving into out-of-distribution detection with vision-language representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ming"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02059"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01839"},{"key":"ref18","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. neural Inf. Process. Syst.","author":"Liu"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02055"},{"key":"ref20","first-page":"13678","article-title":"A closer look at the robustness of contrastive language-image pre-training (CLIP)","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tu"},{"key":"ref21","first-page":"27092","article-title":"DataComp: In search of the next generation of multimodal datasets","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gadre","year":"2023"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01619"},{"key":"ref23","first-page":"5637","article-title":"Wilds: A benchmark of in-the-wild distribution shifts","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Koh"},{"key":"ref24","article-title":"A note on \u201cassessing generalization of SGD via disagreement\u201d","author":"Kirsch","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref25","first-page":"19973","article-title":"Machine vision therapy: Multimodal large language models can enhance visual robustness via denoising in-context learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Huang"},{"key":"ref26","article-title":"Harnessing out-of-distribution examples via augmenting content and style","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Huang"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02075-x"},{"key":"ref28","first-page":"137","article-title":"Analysis of representations for domain adaptation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sch\u00f6lkopf"},{"key":"ref29","article-title":"Domain adaptation: Learning bounds and algorithms","author":"Mansour","year":"2009"},{"key":"ref30","article-title":"Imagenet-trained cnns are biased towards texture; increasing shape bias improves accuracy and robustness","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Geirhos"},{"key":"ref31","article-title":"Benchmarking neural network robustness to common corruptions and perturbations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hendrycks"},{"key":"ref32","first-page":"3571","article-title":"On interaction between augmentations and corruptions in natural corruption robustness","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Mintun"},{"key":"ref33","first-page":"19274","article-title":"Agreement-on-the-line: Predicting the performance of neural networks under distribution shift","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Baek"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"ref35","article-title":"Typographic attacks in large multimodal models can be alleviated by more informative prompts","author":"Cheng"},{"key":"ref36","first-page":"1321","article-title":"On calibration of modern neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Guo"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1182"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref40","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref41","article-title":"Eva-clip: Improved training techniques for clip at scale","author":"Sun","year":"2023"},{"key":"ref42","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. 36th Conf. Neural Inf. Process. Syst. Datasets Benchmarks Track","author":"Schuhmann"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref44","article-title":"Data filtering networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst. Workshop Distrib. Shifts","author":"Fang"},{"key":"ref45","article-title":"Demystifying clip data","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Xu"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref51","first-page":"578","article-title":"RepMLP: Re-parameterizing convolutions into fully-connected layers for image recognition","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Ding"},{"key":"ref52","first-page":"3615","article-title":"MLP-mixer: An all-MLP architecture for vision","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tolstikhin"},{"key":"ref53","first-page":"18583","article-title":"Measuring robustness to natural distribution shifts in image classification","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Taori"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref57","article-title":"Pytorch image models","author":"Wightman","year":"2019"},{"key":"ref58","article-title":"Openclip","author":"Ilharco","year":"2021"},{"key":"ref59","article-title":"Mistral 7B","author":"Jiang","year":"2023"},{"key":"ref60","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref61","first-page":"23123","article-title":"Prismatic VLMs: Investigating the design space of visually-conditioned language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Karamcheti"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00914"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"ref66","first-page":"2471","article-title":"In or out? fixing imagenet out-of-distribution detection evaluation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bitterwolf"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00952"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9602"},{"key":"ref69","first-page":"67","article-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions","volume-title":"Trans. Assoc. Comput. Linguistics","volume":"2","author":"Young","year":"2014"},{"key":"ref70","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"Chen","year":"2015"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref73","first-page":"76061","article-title":"Navi: Category-agnostic image collections with high-quality 3D shape and pose annotations","volume-title":"Proc. Adv. Neural Inf. Process. Syst. Dataset Benchmark Track","author":"Jampani"},{"key":"ref74","article-title":"Spair-71 k: A large-scale benchmark for semantic correspondence","author":"Min","year":"2019"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-04898-2_594"},{"key":"ref76","first-page":"7721","article-title":"Accuracy on the line: On the strong correlation between out-of-distribution and in-distribution generalization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Miller"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01944"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01853"},{"key":"ref80","first-page":"23296","article-title":"Intriguing properties of vision transformers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Naseer"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00713"},{"key":"ref82","first-page":"19000","article-title":"The origins and prevalence of texture bias in convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hermann"},{"key":"ref83","article-title":"Emergence of shape bias in convolutional neural networks through activation sparsity","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref84","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan"},{"key":"ref85","first-page":"22614","article-title":"Revisiting resnets: Improved training and scaling strategies","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bello"},{"key":"ref86","first-page":"10096","article-title":"EfficientNetV2: Smaller models and faster training","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1167\/19.10.209c"},{"key":"ref88","first-page":"7068","article-title":"Exploring the limits of out-of-distribution detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Fort"},{"key":"ref89","article-title":"A Baseline for detecting misclassified and out-of-distribution examples in neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hendrycks"},{"key":"ref90","first-page":"15682","article-title":"Revisiting the calibration of modern neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Minderer"},{"key":"ref91","article-title":"Calibration of neural networks using splines","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gupta"},{"key":"ref92","first-page":"14003","article-title":"Can you trust your model\u2019s uncertainty? evaluating predictive uncertainty under dataset shift","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ovadia"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00297"},{"key":"ref94","first-page":"45533","article-title":"A tale of two features: Stable diffusion complements dino for zero-shot semantic correspondence","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"ref96","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72640-8_7"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01438"},{"key":"ref99","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref100","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00034"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_21"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01234"},{"key":"ref104","article-title":"Fine-tuning can distort pretrained features and underperform out-of-distribution","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kumar"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"ref107","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01231"},{"key":"ref109","article-title":"Laion-aesthetics","author":"Schuhmann","year":"2022"},{"key":"ref110","article-title":"XGEN-MM (BLIP-3): A family of open large multimodal models","author":"Xue","year":"2024"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01042"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11118328\/11037467.pdf?arnumber=11037467","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T17:44:46Z","timestamp":1754588686000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11037467\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":111,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3580234","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}