{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:04:24Z","timestamp":1765357464284,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62141209"],"award-info":[{"award-number":["62141209"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61932007"],"award-info":[{"award-number":["61932007"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China","award":["61972013"],"award-info":[{"award-number":["61972013"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611858","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"4678-4687","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["LiFT: Transfer Learning in Vision-Language Models for Downstream Adaptation and Generalization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4232-8798","authenticated-orcid":false,"given":"Jingzheng","family":"Li","sequence":"first","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7654-5574","authenticated-orcid":false,"given":"Hailong","family":"Sun","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language. arXiv preprint arXiv:2202.03555","author":"Baevski Alexei","year":"2022","unstructured":"Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, and Michael Auli. 2022. Data2vec: A general framework for self-supervised learning in speech, vision and language. arXiv preprint arXiv:2202.03555 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"BEiT: BERT Pre-Training of Image Transformers. In International Conference on Learning Representations.","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2022. BEiT: BERT Pre-Training of Image Transformers. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_4_1","volume-title":"Open-World Semi-Supervised Learning. In International Conference on Learning Representations.","author":"Cao Kaidi","year":"2022","unstructured":"Kaidi Cao, Maria Brbic, and Jure Leskovec. 2022. Open-World Semi-Supervised Learning. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_5_1","volume-title":"AdaptFormer: Adapting Vision Transformers for Scalable Visual Recognition. Advances in Neural Information Processing Systems","author":"Chen Shoufa","year":"2022","unstructured":"Shoufa Chen, Chongjian Ge, Zhan Tong, Jiangliu Wang, Yibing Song, Jue Wang, and Ping Luo. 2022. AdaptFormer: Adapting Vision Transformers for Scalable Visual Recognition. Advances in Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Learning Representations","author":"Chen Xi","year":"2023","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, et al. 2023. Pali: A jointly-scaled multilingual language-image model. International Conference on Learning Representations (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Wooyoung Kang, and Byungseok Roh.","author":"Cho Han-Cheol","year":"2023","unstructured":"Han-Cheol Cho, Won Young Jhoo, Wooyoung Kang, and Byungseok Roh. 2023. Open-Vocabulary Object Detection using Pseudo Caption Labels. arXiv preprint arXiv:2303.13040 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00915"},{"key":"e_1_3_2_1_14_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544","author":"Gao Peng","year":"2021","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2021. Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544 (2021)."},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Learning Representations","author":"Han Kai","year":"2020","unstructured":"Kai Han, Sylvestre-Alvise Rebuffi, Sebastien Ehrhardt, Andrea Vedaldi, and Andrew Zisserman. 2020. Automatically discovering and learning new visual categories with ranking statistics. International Conference on Learning Representations (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00849"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_21_1","volume-title":"Universal language model fine-tuning for text classification. arXiv preprint arXiv:1801.06146","author":"Howard Jeremy","year":"2018","unstructured":"Jeremy Howard and Sebastian Ruder. 2018. Universal language model fine-tuning for text classification. arXiv preprint arXiv:1801.06146 (2018)."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Learning Representations.","author":"Hsu Yen-Chang","year":"2018","unstructured":"Yen-Chang Hsu, Zhaoyang Lv, and Zsolt Kira. 2018. Learning to cluster in order to transfer across domains and tasks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Learning Representations.","author":"Hsu Yen-Chang","year":"2019","unstructured":"Yen-Chang Hsu, Zhaoyang Lv, Joel Schlosser, Phillip Odom, and Zsolt Kira. 2019. Multi-class classification without multi-class labels. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_24_1","volume-title":"Unsupervised Prompt Learning for Vision-Language Models. arXiv preprint arXiv:2204.03649","author":"Huang Tony","year":"2022","unstructured":"Tony Huang, Jack Chu, and Fangyun Wei. 2022. Unsupervised Prompt Learning for Vision-Language Models. arXiv preprint arXiv:2204.03649 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_28_1","volume-title":"How to Adapt Your Large-Scale Vision-and-Language Model. https:\/\/openreview.net\/pdf?id=EhwEUb2ynIa","author":"Konwoo Kim","year":"2022","unstructured":"Kim Konwoo, Laskin Michael, Mordatch Igor, and Pathak Deepak. 2022. How to Adapt Your Large-Scale Vision-and-Language Model. https:\/\/openreview.net\/pdf?id=EhwEUb2ynIa (2022)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Learning Representations.","author":"Kumar Ananya","year":"2022","unstructured":"Ananya Kumar, Aditi Raghunathan, Robbie Matthew Jones, Tengyu Ma, and Percy Liang. 2022. Fine-Tuning can Distort Pretrained Features and Underperform Out-of-Distribution. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_31_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Kuo Weicheng","year":"2023","unstructured":"Weicheng Kuo, Yin Cui, Xiuye Gu, AJ Piergiovanni, and Anelia Angelova. 2023. Open-vocabulary object detection upon frozen vision and language models. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_32_1","volume-title":"The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester Brian","year":"2021","unstructured":"Brian Lester, Rami Al-Rfou, and Noah Constant. 2021. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_34_1","volume-title":"Masked Unsupervised Self-training for Zero-shot Image Classification. International Conference on Learning Representations","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Silvio Savarese, and Steven CH Hoi. 2023. Masked Unsupervised Self-training for Zero-shot Image Classification. International Conference on Learning Representations (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547861"},{"key":"e_1_3_2_1_37_1","volume-title":"prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586","author":"Liu Pengfei","year":"2021","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2021. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586 (2021)."},{"key":"e_1_3_2_1_38_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"e_1_3_2_1_41_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_43_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.7"},{"key":"e_1_3_2_1_46_1","volume-title":"Minh-Thang Luong, Mingxing Tan, and Quoc V Le.","author":"Pham Hieu","year":"2021","unstructured":"Hieu Pham, Zihang Dai, Golnaz Ghiasi, Hanxiao Liu, Adams Wei Yu, Minh-Thang Luong, Mingxing Tan, and Quoc V Le. 2021. Combined scaling for zero-shot transfer learning. arXiv preprint arXiv:2111.10050 (2021)."},{"key":"e_1_3_2_1_47_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_48_1","volume-title":"International Conference on Machine Learning. PMLR, 5389--5400","author":"Recht Benjamin","year":"2019","unstructured":"Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, and Vaishaal Shankar. 2019. Do imagenet classifiers generalize to imagenet?. In International Conference on Machine Learning. PMLR, 5389--5400."},{"key":"e_1_3_2_1_49_1","volume-title":"Rethinking the Openness of CLIP. arXiv preprint arXiv:2206.01986","author":"Ren Shuhuai","year":"2022","unstructured":"Shuhuai Ren, Lei Li, Xuancheng Ren, Guangxiang Zhao, and Xu Sun. 2022. Rethinking the Openness of CLIP. arXiv preprint arXiv:2206.01986 (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"K-lite: Learning transferable visual models with external knowledge. arXiv preprint arXiv:2204.09222","author":"Shen Sheng","year":"2022","unstructured":"Sheng Shen, Chunyuan Li, Xiaowei Hu, Yujia Xie, Jianwei Yang, Pengchuan Zhang, Anna Rohrbach, Zhe Gan, Lijuan Wang, Lu Yuan, et al. 2022. K-lite: Learning transferable visual models with external knowledge. arXiv preprint arXiv:2204.09222 (2022)."},{"key":"e_1_3_2_1_51_1","volume-title":"Test-time prompt tuning for zero-shot generalization in vision-language models. Advances in Neural Information Processing Systems","author":"Shu Manli","year":"2022","unstructured":"Manli Shu, Weili Nie, De-An Huang, Zhiding Yu, Tom Goldstein, Anima Anandkumar, and Chaowei Xiao. 2022. Test-time prompt tuning for zero-shot generalization in vision-language models. Advances in Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_52_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_53_1","volume-title":"International conference on machine learning. PMLR, 9229--9248","author":"Sun Yu","year":"2020","unstructured":"Yu Sun, Xiaolong Wang, Zhuang Liu, John Miller, Alexei Efros, and Moritz Hardt. 2020. Test-time training with self-supervision for generalization under distribution shifts. In International conference on machine learning. PMLR, 9229--9248."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02051"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00734"},{"key":"e_1_3_2_1_57_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Songwei Ge, Zachary Lipton, and Eric P Xing. 2019. Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_58_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2022. Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00780"},{"volume-title":"Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Xiao Jianxiong","key":"e_1_3_2_1_60_1","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3485--3492."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"e_1_3_2_1_62_1","volume-title":"FILIP: Fine-grained Interactive Language-Image Pre-Training. In International Conference on Learning Representations.","author":"Yao Lewei","year":"2021","unstructured":"Lewei Yao, Runhui Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2021. FILIP: Fine-grained Interactive Language-Image Pre-Training. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_63_1","volume-title":"How transferable are features in deep neural networks? Advances in neural information processing systems","author":"Yosinski Jason","year":"2014","unstructured":"Jason Yosinski, Jeff Clune, Yoshua Bengio, and Hod Lipson. 2014. How transferable are features in deep neural networks? Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_64_1","volume-title":"Towards a Unified View on Visual Parameter-Efficient Transfer Learning. arXiv preprint arXiv:2210.00788","author":"Yu Bruce XB","year":"2022","unstructured":"Bruce XB Yu, Jianlong Chang, Lingbo Liu, Qi Tian, and Chang Wen Chen. 2022a. Towards a Unified View on Visual Parameter-Efficient Transfer Learning. arXiv preprint arXiv:2210.00788 (2022)."},{"key":"e_1_3_2_1_65_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022b. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_66_1","volume-title":"Unified Vision and Language Prompt Learning. arXiv preprint arXiv:2210.07225","author":"Zang Yuhang","year":"2022","unstructured":"Yuhang Zang, Wei Li, Kaiyang Zhou, Chen Huang, and Chen Change Loy. 2022. Unified Vision and Language Prompt Learning. arXiv preprint arXiv:2210.07225 (2022)."},{"key":"e_1_3_2_1_67_1","first-page":"18408","article-title":"Flexmatch: Boosting semi-supervised learning with curriculum pseudo labeling","volume":"34","author":"Zhang Bowen","year":"2021","unstructured":"Bowen Zhang, Yidong Wang, Wenxin Hou, Hao Wu, Jindong Wang, Manabu Okumura, and Takahiro Shinozaki. 2021. Flexmatch: Boosting semi-supervised learning with curriculum pseudo labeling. Advances in Neural Information Processing Systems, Vol. 34 (2021), 18408--18419.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_68_1","volume-title":"European conference on computer vision","author":"Zhang Renrui","year":"2022","unstructured":"Renrui Zhang, Rongyao Fang, Peng Gao, Wei Zhang, Kunchang Li, Jifeng Dai, Yu Qiao, and Hongsheng Li. 2022a. Tip-adapter: Training-free clip-adapter for better vision-language modeling. European conference on computer vision (2022)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548396"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_72_1","volume-title":"Prompt-aligned Gradient for Prompt Tuning. arXiv preprint arXiv:2205.14865","author":"Zhu Beier","year":"2022","unstructured":"Beier Zhu, Yulei Niu, Yucheng Han, Yue Wu, and Hanwang Zhang. 2022. Prompt-aligned Gradient for Prompt Tuning. arXiv preprint arXiv:2205.14865 (2022)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611858","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611858","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:26Z","timestamp":1755820826000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611858"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":72,"alternative-id":["10.1145\/3581783.3611858","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611858","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}