{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T05:16:12Z","timestamp":1773119772738,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Program of Shanghai Science and Technology Innovation Project","award":["20511100100"],"award-info":[{"award-number":["20511100100"]}]},{"name":"Natural Science Foundation of China","award":["62250055; 61831018; 61932022; 61931023; 62120106007;61971285; 61972256;T2122024; 62125109"],"award-info":[{"award-number":["62250055; 61831018; 61932022; 61931023; 62120106007;61971285; 61972256;T2122024; 62125109"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611706","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"4595-4605","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["VioLET: Vision-Language Efficient Tuning with Collaborative Multi-modal Gradients"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0026-4247","authenticated-orcid":false,"given":"Yaoming","family":"Wang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3096-448X","authenticated-orcid":false,"given":"Yuchen","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1276-1424","authenticated-orcid":false,"given":"Xiaopeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Ltd, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9846-5951","authenticated-orcid":false,"given":"Jin","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9169-7055","authenticated-orcid":false,"given":"Bowen","family":"Shi","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2888-594X","authenticated-orcid":false,"given":"Chenglin","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2522-5778","authenticated-orcid":false,"given":"Wenrui","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4552-0029","authenticated-orcid":false,"given":"Hongkai","family":"Xiong","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5165-4325","authenticated-orcid":false,"given":"Qi","family":"Tian","sequence":"additional","affiliation":[{"name":"Huawei Technologies Ltd, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_2_1","volume-title":"PLOT: Prompt Learning with Optimal Transport for Vision-Language Models. In The Eleventh International Conference on Learning Representations.","author":"Chen Guangyi","year":"2023","unstructured":"Guangyi Chen, Weiran Yao, Xiangchen Song, Xinyue Li, Yongming Rao, and Kun Zhang. 2023. PLOT: Prompt Learning with Optimal Transport for Vision-Language Models. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_3_1","volume-title":"Adaptformer: Adapting vision transformers for scalable visual recognition. arXiv preprint arXiv:2205.13535","author":"Chen Shoufa","year":"2022","unstructured":"Shoufa Chen, Chongjian Ge, Zhan Tong, Jiangliu Wang, Yibing Song, Jue Wang, and Ping Luo. 2022. Adaptformer: Adapting vision transformers for scalable visual recognition. arXiv preprint arXiv:2205.13535 (2022)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_6_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id= YicbFdNTTy","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id= YicbFdNTTy"},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Artificial Intelligence and Statistics. PMLR, 3762--3773","author":"Farajtabar Mehrdad","year":"2020","unstructured":"Mehrdad Farajtabar, Navid Azizan, Alex Mott, and Ang Li. 2020. Orthogonal gradient descent for continual learning. In International Conference on Artificial Intelligence and Statistics. PMLR, 3762--3773."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11174"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_16_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Visual prompt tuning. arXiv preprint arXiv:2203.12119","author":"Jia Menglin","year":"2022","unstructured":"Menglin Jia, Luming Tang, Bor-Chun Chen, Claire Cardie, Serge Belongie, Bharath Hariharan, and Ser-Nam Lim. 2022. Visual prompt tuning. arXiv preprint arXiv:2203.12119 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Convolutional bypasses are better vision transformer adapters. arXiv preprint arXiv:2207.07039","author":"Jie Shibo","year":"2022","unstructured":"Shibo Jie and Zhi-Hong Deng. 2022. Convolutional bypasses are better vision transformer adapters. arXiv preprint arXiv:2207.07039 (2022)."},{"key":"e_1_3_2_1_19_1","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, James Henderson, and Sebastian Ruder. 2021. Compacter: Efficient low-rank hypercomplex adapter layers. Advances in Neural Information Processing Systems 34 (2021), 1022--1035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_20_1","volume-title":"Maple: Multi-modal prompt learning. arXiv preprint arXiv:2210.03117","author":"Khattak Muhammad Uzair","year":"2022","unstructured":"Muhammad Uzair Khattak, Hanoona Rasheed, Muhammad Maaz, Salman Khan, and Fahad Shahbaz Khan. 2022. Maple: Multi-modal prompt learning. arXiv preprint arXiv:2210.03117 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester Brian","year":"2021","unstructured":"Brian Lester, Rami Al-Rfou, and Noah Constant. 2021. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_22_1","volume-title":"Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models. arXiv preprint arXiv:2303.06571","author":"Li Juncheng","year":"2023","unstructured":"Juncheng Li, Minghe Gao, Longhui Wei, Siliang Tang, Wenqiao Zhang, Mengze Li,Wei Ji, Qi Tian, Tat-Seng Chua, and Yueting Zhuang. 2023. Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models. arXiv preprint arXiv:2303.06571 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Fine-grained semantically aligned vision-language pre-training. arXiv preprint arXiv:2208.02515","author":"Li Juncheng","year":"2022","unstructured":"Juncheng Li, Xin He, Longhui Wei, Long Qian, Linchao Zhu, Lingxi Xie, Yueting Zhuang, Qi Tian, and Siliang Tang. 2022. Fine-grained semantically aligned vision-language pre-training. arXiv preprint arXiv:2208.02515 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_25_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_26_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Parameter-efficient multi-task fine-tuning for transformers via shared hypernetworks. arXiv preprint arXiv:2106.04489","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, Sebastian Ruder, Mostafa Dehghani, and James Henderson. 2021. Parameter-efficient multi-task fine-tuning for transformers via shared hypernetworks. arXiv preprint arXiv:2106.04489 (2021)."},{"key":"e_1_3_2_1_29_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, MatthewBlaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.42"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning. PMLR, 5389--5400","author":"Recht Benjamin","year":"2019","unstructured":"Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, and Vaishaal Shankar. 2019. Do imagenet classifiers generalize to imagenet?. In International Conference on Machine Learning. PMLR, 5389--5400."},{"key":"e_1_3_2_1_34_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_35_1","first-page":"24193","article-title":"Training neural networks with fixed sparse masks","volume":"34","author":"Sung Yi-Lin","year":"2021","unstructured":"Yi-Lin Sung, Varun Nair, and Colin A Raffel. 2021. Training neural networks with fixed sparse masks. Advances in Neural Information Processing Systems 34 (2021), 24193--24205.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, ?ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_37_1","volume-title":"Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems 32","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Songwei Ge, Zachary Lipton, and Eric P Xing. 2019. Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01532"},{"key":"e_1_3_2_1_39_1","volume-title":"Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Xiao Jianxiong","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3485--3492."},{"key":"e_1_3_2_1_40_1","first-page":"5824","article-title":"Gradient surgery for multi-task learning","volume":"33","author":"Yu Tianhe","year":"2020","unstructured":"Tianhe Yu, Saurabh Kumar, Abhishek Gupta, Sergey Levine, Karol Hausman, and Chelsea Finn. 2020. Gradient surgery for multi-task learning. Advances in Neural Information Processing Systems 33 (2020), 5824--5836.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","volume-title":"Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199","author":"Zaken Elad Ben","year":"2021","unstructured":"Elad Ben Zaken, Shauli Ravfogel, and Yoav Goldberg. 2021. Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199 (2021)."},{"key":"e_1_3_2_1_42_1","volume-title":"Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225","author":"Zang Yuhang","year":"2022","unstructured":"Yuhang Zang, Wei Li, Kaiyang Zhou, Chen Huang, and Chen Change Loy. 2022. Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611706","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611706","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:05:22Z","timestamp":1755821122000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611706"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":44,"alternative-id":["10.1145\/3581783.3611706","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611706","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}