{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T12:13:46Z","timestamp":1743077626477,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":51,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819786190"},{"type":"electronic","value":"9789819786206"}],"license":[{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8620-6_26","type":"book-chapter","created":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T21:02:10Z","timestamp":1729371730000},"page":"379-394","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["VL-MPFT: Multitask Parameter-Efficient Fine-Tuning for\u00a0Visual-Language Pre-trained Models via\u00a0Task-Adaptive Masking"],"prefix":"10.1007","author":[{"given":"Min","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Guanming","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zhihua","family":"Wei","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,20]]},"reference":[{"key":"26_CR1","doi-asserted-by":"crossref","unstructured":"Aghajanyan, A., Gupta, A., Shrivastava, A., Chen, X., Zettlemoyer, L., Gupta, S.: Muppet: Massive multi-task representations with pre-finetuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 5799\u20135811 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.468"},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"Asai, A., Salehi, M., Peters, M.E., Hajishirzi, H.: Attempt: Parameter-efficient multi-task tuning via attentional mixtures of soft prompts. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 6655\u20136672 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.446"},{"key":"26_CR3","first-page":"32897","volume":"35","author":"H Bao","year":"2022","unstructured":"Bao, H., Wang, W., Dong, L., Liu, Q., Mohammed, O.K., Aggarwal, K., Som, S., Piao, S., Wei, F.: Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. Adv. Neural. Inf. Process. Syst. 35, 32897\u201332912 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR4","unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco captions: Data collection and evaluation server (2015). arXiv:1504.00325"},{"key":"26_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: Uniter: Universal image-text representation learning. In: European Conference on Computer Vision, pp. 104\u2013120. Springer","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"26_CR6","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. 
In: International Conference on Machine Learning, pp. 1931\u20131942. PMLR, VL-T5"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Diao, S., Xu, T., Xu, R., Wang, J., Zhang, T.: Mixture-of-domain-adapters: decoupling and injecting domain knowledge to pre-trained language models\u2019 memories, pp. 5113\u20135129 (2023)","DOI":"10.18653\/v1\/2023.acl-long.280"},{"key":"26_CR8","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in vqa matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"26_CR9","doi-asserted-by":"crossref","unstructured":"Guo, D., Rush, A.M., Kim, Y.: Parameter-efficient transfer learning with diff pruning. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 4884\u20134896 (2021)","DOI":"10.18653\/v1\/2021.acl-long.378"},{"key":"26_CR10","unstructured":"He, J., Zhou, C., Ma, X., Berg-Kirkpatrick, T., Neubig, G.: Towards a unified view of parameter-efficient transfer learning (2021). arXiv:2110.04366"},{"key":"26_CR11","unstructured":"Houlsby, N., Giurgiu, A., Jastrzebski, S., Morrone, B., De\u00a0Laroussilhe, Q., Gesmundo, A., Attariyan, M., Gelly, S.: Parameter-efficient transfer learning for NLP. In: International Conference on Machine Learning, pp. 2790\u20132799. PMLR (2019)"},{"key":"26_CR12","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., Chen, W.: Lora: Low-rank adaptation of large language models (2021). arXiv:2106.09685"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A.: Unit: Multimodal multitask learning with a unified transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1439\u20131449 (2021)","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"26_CR14","unstructured":"Hu, Z.Y., Li, Y., Lyu, M.R., Wang, L.: VL-PET: Vision-and-language parameter-efficient tuning via granularity control. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3010\u20133020"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"26_CR16","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR, ALIGN"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B.C., Cardie, C., Belongie, S., Hariharan, B., Lim, S.N.: Visual prompt tuning. In: European Conference on Computer Vision, pp. 709\u2013727. Springer (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"26_CR18","unstructured":"Jian, Y., Gao, C., Vosoughi, S.: Bootstrapping vision-language learning with decoupled language pre-training (2023). 
arXiv:2307.07063"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, J., Zheng, N.: MixPHM: Redundancy-aware parameter-efficient tuning for low-resource visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24203\u201324213","DOI":"10.1109\/CVPR52729.2023.02318"},{"key":"26_CR20","first-page":"1022","volume":"34","author":"R Karimi Mahabadi","year":"2021","unstructured":"Karimi Mahabadi, R., Henderson, J., Ruder, S.: Compacter: Efficient low-rank hypercomplex adapter layers. Adv. Neural. Inf. Process. Syst. 34, 1022\u20131035 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR21","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: Multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Kudugunta, S., Huang, Y., Bapna, A., Krikun, M., Lepikhin, D., Luong, M.T., Firat, O.: Beyond distillation: Task-level mixture-of-experts for efficient inference. In: Findings of the Association for Computational Linguistics: EMNLP 2021, pp. 3577\u20133599 (2021)","DOI":"10.18653\/v1\/2021.findings-emnlp.304"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 3045\u20133059 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"26_CR25","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., Zettlemoyer, L.: BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7871\u20137880"},{"key":"26_CR26","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models (2023). arXiv:2301.12597"},{"key":"26_CR27","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR"},{"key":"26_CR28","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: Vision and language representation learning with momentum distillation 34, 9694\u20139705, ALBEF"},{"key":"26_CR29","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 
4582\u20134597 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"26_CR30","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F.: Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16, pp. 121\u2013137. Springer","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"26_CR31","first-page":"36889","volume":"35","author":"YC Liu","year":"2022","unstructured":"Liu, Y.C., Ma, C.Y., Tian, J., He, Z., Kira, Z.: Polyhistor: Parameter-efficient multi-task adaptation for dense vision tasks. Adv. Neural. Inf. Process. Syst. 35, 36889\u201336901 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR32","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"26_CR33","unstructured":"Mahabadi, R.K., Ruder, S., Dehghani, M., Henderson, J.: Parameter-efficient multi-task fine-tuning for transformers via shared hypernetworks. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 565\u2013576 (2021)"},{"key":"26_CR34","doi-asserted-by":"crossref","unstructured":"Ma\u00f1as, O., Lopez, P.R., Ahmadi, S., Nematzadeh, A., Goyal, Y., Agrawal, A.: Mapl: Parameter-efficient adaptation of unimodal pre-trained models for vision-language few-shot prompting. In: Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pp. 2515\u20132540 (2023)","DOI":"10.18653\/v1\/2023.eacl-main.185"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Muennighoff, N., Wang, T., Sutawika, L., Roberts, A., Biderman, S., Le\u00a0Scao, T., Bari, M.S., Shen, S., Yong, Z.X., Schoelkopf, H., et\u00a0al.: Crosslingual generalization through multitask finetuning. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 15991\u201316111 (2023)","DOI":"10.18653\/v1\/2023.acl-long.891"},{"key":"26_CR36","doi-asserted-by":"crossref","unstructured":"Pfeiffer, J., Kamath, A., R\u00fcckl\u00e9, A., Cho, K., Gurevych, I.: Adapterfusion: Non-destructive task composition for transfer learning. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pp. 487\u2013503 (2021)","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"26_CR37","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR, CLIP"},{"key":"26_CR38","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training. 
Publisher: OpenAI (2018)"},{"key":"26_CR39","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P.J.: Exploring the limits of transfer learning with a unified text-to-text transformer 21(1), 5485\u20135551, ISBN: 1532-4435 Publisher: JMLRORG T5"},{"key":"26_CR40","unstructured":"Su, W., Zhu, X., Cao, Y., Li, B., Lu, L., Wei, F., Dai, J.: Vl-bert: Pre-training of generic visual-linguistic representations (2019). arXiv:1908.08530"},{"key":"26_CR41","doi-asserted-by":"crossref","unstructured":"Suhr, A., Zhou, S., Zhang, A., Zhang, I., Bai, H., Artzi, Y.: A corpus for reasoning about natural language grounded in photographs. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 6418\u20136428 (2019)","DOI":"10.18653\/v1\/P19-1644"},{"key":"26_CR42","unstructured":"Sung, Y.L., Cho, J., Bansal, M.: Vl-adapter: Parameter-efficient transfer learning for vision-and-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5227\u20135237"},{"key":"26_CR43","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: Learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5100\u20135111 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"26_CR44","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR45","unstructured":"Wang, P., Yang, A., Men, R., Lin, J., Bai, S., Li, Z., Ma, J., Zhou, C., Zhou, J., Yang, H.: Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR"},{"key":"26_CR46","unstructured":"Wu, C., Wang, T., Ge, Y., Lu, Z., Zhou, R., Shan, Y., Luo, P.: pi-tuning: Transferring multimodal foundation models with optimal multi-task interpolation. In: International Conference on Machine Learning, pp. 37713\u201337727. PMLR (2023)"},{"key":"26_CR47","unstructured":"Zaken, E.B., Goldberg, Y., Ravfogel, S.: Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 1\u20139 (2022)"},{"key":"26_CR48","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., Choi, Y., Gao, J.: Vinvl: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5579\u20135588"},{"key":"26_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"26_CR50","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"26_CR51","doi-asserted-by":"crossref","unstructured":"Zou, B., Yang, C., Qiao, Y., Quan, C., Zhao, Y.: Llama-excitor: General instruction tuning via indirect feature interaction (2024). arXiv:2404.00913","DOI":"10.1109\/CVPR52733.2024.01336"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8620-6_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T20:17:30Z","timestamp":1736885850000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8620-6_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,20]]},"ISBN":["9789819786190","9789819786206"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8620-6_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,20]]},"assertion":[{"value":"20 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
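The record above is a Crossref REST API work message. A minimal sketch of how a record in exactly this shape can be fetched and a few of the fields above read back, assuming network access and the public Crossref endpoint https://api.crossref.org/works/{DOI} (standard library only; the script is illustrative and not part of the deposited metadata):

```python
import json
import urllib.request

DOI = "10.1007/978-981-97-8620-6_26"

# Crossref returns {"status": "ok", "message-type": "work", "message": {...}},
# the same structure shown above.
with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    record = json.load(resp)

msg = record["message"]
print(msg["title"][0])                # chapter title
print(msg["type"], msg["page"])       # "book-chapter", "379-394"
print(", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"]))

# Crossref encodes dates as nested lists: "date-parts": [[year, month, day]]
year, month, day = msg["published"]["date-parts"][0]
print(f"published: {year:04d}-{month:02d}-{day:02d}")

# Count how many of the 51 deposited references carry a resolvable DOI
print(sum("DOI" in ref for ref in msg["reference"]), "references with DOIs")
```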