{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:22Z","timestamp":1765309582184,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.U21B2048 and No.62302382"],"award-info":[{"award-number":["No.U21B2048 and No.62302382"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Key Technical Projects","award":["CJGJZD2022051714160501"],"award-info":[{"award-number":["CJGJZD2022051714160501"]}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["No.2024M752584"],"award-info":[{"award-number":["No.2024M752584"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754846","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"2870-2879","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CIA: Class- and Instance-aware Adaptation for Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1786-1946","authenticated-orcid":false,"given":"Lin","family":"Peng","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2107-0007","authenticated-orcid":false,"given":"Cong","family":"Wan","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8945-1200","authenticated-orcid":false,"given":"Shaokun","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1740-9104","authenticated-orcid":false,"given":"Xiang","family":"Song","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6750-1403","authenticated-orcid":false,"given":"Yuhang","family":"He","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1793-5836","authenticated-orcid":false,"given":"Yihong","family":"Gong","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"446","volume-title":"Zurich","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc Van Gool. 2014. Food-101-mining discriminative components with random forests. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part VI 13. Springer, 446-461."},{"key":"e_1_3_2_2_2_1","unstructured":"Guangyi Chen Weiran Yao Xiangchen Song Xinyue Li Yongming Rao and Kun Zhang. 2022. Prompt learning with optimal transport for vision-language models. (2022)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02011"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-024-2762-5"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-024-2753-3"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"e_1_3_2_2_9_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. International Journal of Computer Vision","author":"Gao Peng","year":"2023","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2023. Clip-adapter: Better vision-language models with feature adapters. International Journal of Computer Vision (2023), 1-15."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-024-2763-0"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_2_2_12_1","first-page":"8340","article-title":"The many faces of robustness: A critical analysis of out-of-distribution generalization","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Steven Basart, Norman Mu, Saurav Kadavath, Frank Wang, Evan Dorundo, Rahul Desai, Tyler Zhu, Samyak Parajuli, Mike Guo, et al., 2021a. The many faces of robustness: A critical analysis of out-of-distribution generalization. In ICCV. 8340-8349.","journal-title":"ICCV."},{"key":"e_1_3_2_2_13_1","first-page":"15262","article-title":"Natural adversarial examples","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Kevin Zhao, Steven Basart, Jacob Steinhardt, and Dawn Song. 2021b. Natural adversarial examples. In CVPR. 15262-15271.","journal-title":"CVPR."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02244"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-024-2755-x"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00135"},{"key":"e_1_3_2_2_21_1","volume-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208","author":"Li Yangguang","year":"2021","unstructured":"Yangguang Li, Feng Liang, Lichen Zhao, Yufeng Cui, Wanli Ouyang, Jing Shao, Fengwei Yu, and Junjie Yan. 2021. Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208 (2021)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-024-2748-3"},{"key":"e_1_3_2_2_23_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01424"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_2_27_1","volume-title":"Global self-sustaining and local inheritance for source-free unsupervised domain adaptation. Pattern Recognition","author":"Peng Lin","year":"2024","unstructured":"Lin Peng, Yuhang He, Shaokun Wang, Xiang Song, Songlin Dong, Xing Wei, and Yihong Gong. 2024. Global self-sustaining and local inheritance for source-free unsupervised domain adaptation. Pattern Recognition (2024), 110679."},{"key":"e_1_3_2_2_28_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_2_29_1","first-page":"5389","article-title":"Do imagenet classifiers generalize to imagenet?","author":"Recht Benjamin","year":"2019","unstructured":"Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, and Vaishaal Shankar. 2019. Do imagenet classifiers generalize to imagenet?. In ICML. PMLR, 5389-5400.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02235"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i13.29427"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02826"},{"key":"e_1_3_2_2_33_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_2_34_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_2_35_1","first-page":"136576","article-title":"Prompt-agnostic adversarial perturbation for customized diffusion models","volume":"37","author":"Wan Cong","year":"2024","unstructured":"Cong Wan, Yuhang He, Xiang Song, and Yihong Gong. 2024a. Prompt-agnostic adversarial perturbation for customized diffusion models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 136576-136619.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_36_1","volume-title":"Grid: Visual layout generation. arXiv e-prints","author":"Wan Cong","year":"2024","unstructured":"Cong Wan, Xiangyang Luo, Zijian Cai, Yiren Song, Yunlong Zhao, Yifan Bai, Yuhang He, and Yihong Gong. 2024b. Grid: Visual layout generation. arXiv e-prints (2024), arXiv-2412."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-024-2732-9"},{"key":"e_1_3_2_2_38_1","volume-title":"NeurIPS","volume":"32","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Songwei Ge, Zachary Lipton, and Eric P Xing. 2019. Learning robust global representations by penalizing local predictive power. In NeurIPS, Vol. 32."},{"key":"e_1_3_2_2_39_1","volume-title":"European Conference on Computer Vision. Springer, 144-162","author":"Wang Qiang","year":"2024","unstructured":"Qiang Wang, Yuhang He, Songlin Dong, Xinyuan Gao, Shaokun Wang, and Yihong Gong. 2024b. Non-exemplar domain incremental learning via cross-domain concept integration. In European Conference on Computer Vision. Springer, 144-162."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i20.35418"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00456"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3262739"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611926"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680983"},{"key":"e_1_3_2_2_45_1","first-page":"3485","article-title":"Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Xiao Jianxiong","year":"2010","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3485-3492.","journal-title":"IEEE"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02191"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01228"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_2_2_51_1","volume-title":"European Conference on Computer Vision. Springer, 161-179","author":"Zhao Zeyang","year":"2024","unstructured":"Zeyang Zhao, Qilong Xue, Yuhang He, Yifan Bai, Xing Wei, and Yihong Gong. 2024. Projecting points to axes: Oriented object detection via point-axis representation. In European Conference on Computer Vision. Springer, 161-179."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01435"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754846","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:42:50Z","timestamp":1765309370000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754846"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3754846","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754846","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}