{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T07:40:33Z","timestamp":1769067633140,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Project","award":["No.2020AAA0106200"],"award-info":[{"award-number":["No.2020AAA0106200"]}]},{"name":"National Nature Science Foundation of China","award":["No.61936005, No.62325206"],"award-info":[{"award-number":["No.61936005, No.62325206"]}]},{"name":"Opening Foundation of Key Laboratory of Computer Vision and System, Ministry of Education, Tianjin University of Technology, China"},{"name":"Graduate Research and Innovation Projects in Jiangsu Province","award":["KYCX23_1022"],"award-info":[{"award-number":["KYCX23_1022"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612222","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"5089-5098","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Self-PT: Adaptive Self-Prompt Tuning for Low-Resource Visual Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8051-3070","authenticated-orcid":false,"given":"Bowen","family":"Yuan","sequence":"first","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2173-1694","authenticated-orcid":false,"given":"Sisi","family":"You","sequence":"additional","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5956-831X","authenticated-orcid":false,"given":"Bing-Kun","family":"Bao","sequence":"additional","affiliation":[{"name":"Nanjing University of Posts and Telecommunications &amp; Peng Cheng Laboratory, Nanjing &amp; Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.168"},{"key":"e_1_3_2_1_4_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Cho Jaemin","year":"2021","unstructured":"Jaemin Cho, Jie Lei, Hao Tan, and Mohit Bansal. 2021. Unifying vision-and-language tasks via text generation. In International Conference on Machine Learning. PMLR, 1931--1942."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.446"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.576"},{"key":"e_1_3_2_1_10_1","volume-title":"arXiv preprint arXiv:1609.09106","author":"Ha David","year":"2016","unstructured":"David Ha, Andrew Dai, and Quoc V Le. 2016. Hypernetworks. arXiv preprint arXiv:1609.09106 (2016)."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations.","author":"He Junxian","year":"2021","unstructured":"Junxian He, Chunting Zhou, Xuezhe Ma, Taylor Berg-Kirkpatrick, and Graham Neubig. 2021. Towards a Unified View of Parameter-Efficient Transfer Learning. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Machine Learning. PMLR, 8678--8690","author":"He Yun","year":"2022","unstructured":"Yun He, Steven Zheng, Yi Tay, Jai Gupta, Yu Du, Vamsi Aribandi, Zhe Zhao, YaGuang Li, Zhao Chen, Donald Metzler, et al. 2022. Hyperprompt: Prompt-based task-conditioning of transformers. In International Conference on Machine Learning. PMLR, 8678--8690."},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_14_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_16_1","volume-title":"Tel Aviv","author":"Jia Menglin","year":"2022","unstructured":"Menglin Jia, Luming Tang, Bor-Chun Chen, Claire Cardie, Serge Belongie, Bharath Hariharan, and Ser-Nam Lim. 2022. Visual prompt tuning. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXIII. Springer, 709--727."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02318"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"e_1_3_2_1_19_1","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, James Henderson, and Sebastian Ruder. 2021. Compacter: Efficient low-rank hypercomplex adapter layers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 1022--1035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_21_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_23_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"e_1_3_2_1_26_1","volume-title":"GPT understands, too. arXiv preprint arXiv:2103.10385","author":"Liu Xiao","year":"2021","unstructured":"Xiao Liu, Yanan Zheng, Zhengxiao Du, Ming Ding, Yujie Qian, Zhilin Yang, and Jie Tang. 2021. GPT understands, too. arXiv preprint arXiv:2103.10385 (2021)."},{"key":"e_1_3_2_1_27_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling. arXiv preprint arXiv:2302.06605","author":"Lu Haoyu","year":"2023","unstructured":"Haoyu Lu, Mingyu Ding, Yuqi Huo, Guoxing Yang, Zhiwu Lu, Masayoshi Tomizuka, and Wei Zhan. 2023. UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling. arXiv preprint arXiv:2302.06605 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 565--576","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, Sebastian Ruder, Mostafa Dehghani, and James Henderson. 2021. Parameter-efficient Multi-task Fine-tuning for Transformers via Shared Hypernetworks. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). 565--576."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_31_1","volume-title":"True few-shot learning with language models. Advances in neural information processing systems","author":"Perez Ethan","year":"2021","unstructured":"Ethan Perez, Douwe Kiela, and Kyunghyun Cho. 2021. True few-shot learning with language models. Advances in neural information processing systems, Vol. 34 (2021), 11054--11070."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_1_35_1","volume-title":"Cross-domain Federated Adaptive Prompt Tuning for CLIP. arXiv preprint arXiv:2211.07864","author":"Su Shangchao","year":"2022","unstructured":"Shangchao Su, Mingzhao Yang, Bin Li, and Xiangyang Xue. 2022. Cross-domain Federated Adaptive Prompt Tuning for CLIP. arXiv preprint arXiv:2211.07864 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"e_1_3_2_1_37_1","first-page":"24193","article-title":"Training neural networks with fixed sparse masks","volume":"34","author":"Sung Yi-Lin","year":"2021","unstructured":"Yi-Lin Sung, Varun Nair, and Colin A Raffel. 2021. Training neural networks with fixed sparse masks. Advances in Neural Information Processing Systems, Vol. 34 (2021), 24193--24205.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547830"},{"key":"e_1_3_2_1_39_1","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume":"34","author":"Tsimpoukelli Maria","year":"2021","unstructured":"Maria Tsimpoukelli, Jacob L Menick, Serkan Cabi, SM Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal few-shot learning with frozen language models. Advances in Neural Information Processing Systems, Vol. 34 (2021), 200--212.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022b. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318--23340."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.388"},{"key":"e_1_3_2_1_43_1","volume-title":"Tel Aviv","author":"Wang Zifeng","year":"2022","unstructured":"Zifeng Wang, Zizhao Zhang, Sayna Ebrahimi, Ruoxi Sun, Han Zhang, Chen-Yu Lee, Xiaoqi Ren, Guolong Su, Vincent Perot, Jennifer Dy, et al. 2022c. Dualprompt: Complementary prompting for rehearsal-free continual learning. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXVI. Springer, 631--648."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00024"},{"key":"e_1_3_2_1_45_1","volume-title":"Prompt Tuning for Generative Multimodal Pretrained Models. arXiv preprint arXiv:2208.02532","author":"Yang Hao","year":"2022","unstructured":"Hao Yang, Junyang Lin, An Yang, Peng Wang, Chang Zhou, and Hongxia Yang. 2022b. Prompt Tuning for Generative Multimodal Pretrained Models. arXiv preprint arXiv:2208.02532 (2022)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.1"},{"key":"e_1_3_2_1_48_1","volume-title":"Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. In International Conference on Machine Learning. PMLR, 25994--26009","author":"Zeng Yan","year":"2022","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2022. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. In International Conference on Machine Learning. PMLR, 25994--26009."},{"key":"e_1_3_2_1_49_1","volume-title":"International Conference on Learning Representations.","author":"Zhang Aston","year":"2020","unstructured":"Aston Zhang, Yi Tay, SHUAI Zhang, Alvin Chan, Anh Tuan Luu, Siu Hui, and Jie Fu. 2020. Beyond Fully-Connected Layers with Quaternions: Parameterization of Hypercomplex Multiplications with 1\/n Parameters. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_51_1","volume-title":"Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning. In The Eleventh International Conference on Learning Representations.","author":"Zhang Qingru","year":"2022","unstructured":"Qingru Zhang, Minshuo Chen, Alexander Bukharin, Pengcheng He, Yu Cheng, Weizhu Chen, and Tuo Zhao. 2022a. Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_52_1","volume-title":"Hyperpelt: Unified parameter-efficient language model tuning for both language and vision-and-language tasks. arXiv preprint arXiv:2203.03878","author":"Zhang Zhengkun","year":"2022","unstructured":"Zhengkun Zhang, Wenya Guo, Xiaojun Meng, Yasheng Wang, Yadao Wang, Xin Jiang, Qun Liu, and Zhenglu Yang. 2022b. Hyperpelt: Unified parameter-efficient language model tuning for both language and vision-and-language tasks. arXiv preprint arXiv:2203.03878 (2022)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612222","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612222","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T16:24:33Z","timestamp":1769012673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612222"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":54,"alternative-id":["10.1145\/3581783.3612222","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612222","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}