{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:49:51Z","timestamp":1778082591659,"version":"3.51.4"},"publisher-location":"Singapore","reference-count":65,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819609079","type":"print"},{"value":"9789819609086","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0908-6_3","type":"book-chapter","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T19:24:38Z","timestamp":1733513078000},"page":"44-62","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["TuneVLSeg: Prompt Tuning Benchmark for\u00a0Vision-Language Segmentation Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5019-2205","authenticated-orcid":false,"given":"Rabin","family":"Adhikari","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4463-6700","authenticated-orcid":false,"given":"Safal","family":"Thapaliya","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0101-5592","authenticated-orcid":false,"given":"Manish","family":"Dhakal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2775-4748","authenticated-orcid":false,"given":"Bishesh","family":"Khanal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,7]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Akiba, T., Sano, S., Yanase, T., Ohta, T., Koyama, M.: Optuna: A next-generation hyperparameter optimization framework. In: Proceedings of the 25th ACM SIGKDD international conference on knowledge discovery & data mining. pp. 2623\u20132631 (2019)","DOI":"10.1145\/3292500.3330701"},{"key":"3_CR2","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2019.104863","volume":"28","author":"W Al-Dhabyani","year":"2020","unstructured":"Al-Dhabyani, W., Gomaa, M., Khaled, H., Fahmy, A.: Dataset of breast ultrasound images. Data Brief 28, 104863 (2020)","journal-title":"Data Brief"},{"key":"3_CR3","doi-asserted-by":"publisher","first-page":"43669","DOI":"10.1109\/ACCESS.2022.3168693","volume":"10","author":"NS An","year":"2022","unstructured":"An, N.S., Lan, P.N., Hang, D.V., Long, D.V., Trung, T.Q., Thuy, N.T., Sang, D.V.: BlazeNeo: Blazing fast polyp segmentation and neoplasm detection. IEEE Access 10, 43669\u201343684 (2022)","journal-title":"IEEE Access"},{"key":"3_CR4","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Bernal, J., S\u00e1nchez, F.J., Fern\u00e1ndez-Esparrach, G., Gil, D., Rodr\u00edguez, C., Vilari\u00f1o, F.: Wm-dova maps for accurate polyp highlighting in colonoscopy: Validation vs. saliency maps from physicians. Computerized Medical Imaging and Graphics 43, 99\u2013111 (2015)","DOI":"10.1016\/j.compmedimag.2015.02.007"},{"key":"3_CR6","unstructured":"Bordes, F., Pang, R.Y., Ajay, A., Li, A.C., Bardes, A., Petryk, S., Ma\u00f1as, O., Lin, Z., Mahmoud, A., Jayaraman, B., et\u00a0al.: An introduction to vision-language modeling. arXiv preprint arXiv:2405.17247 (2024)"},{"issue":"2","key":"3_CR7","doi-asserted-by":"publisher","first-page":"125","DOI":"10.3390\/info11020125","volume":"11","author":"A Buslaev","year":"2020","unstructured":"Buslaev, A., Iglovikov, V.I., Khvedchenya, E., Parinov, A., Druzhinin, M., Kalinin, A.A.: Albumentations: fast and flexible image augmentations. Information 11(2), 125 (2020)","journal-title":"Information"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: UNITER: Universal image-text representation learning. In: European conference on computer vision. pp. 104\u2013120. Springer (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., Schiele, B.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition. pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"3_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"3_CR12","unstructured":"Everingham, M., Winn, J.: The pascal visual object classes challenge 2012 (voc2012) development kit. Pattern Anal. Stat. Model. Comput. Learn., Tech. Rep 2007(1-45), 5 (2012)"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Goyal, S., Kumar, A., Garg, S., Kolter, Z., Raghunathan, A.: Finetune like you pretrain: Improved finetuning of zero-shot vision models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19338\u201319347 (2023)","DOI":"10.1109\/CVPR52729.2023.01853"},{"key":"3_CR14","unstructured":"Gutman, D., Codella, N.C., Celebi, E., Helba, B., Marchetti, M., Mishra, N., Halpern, A.: Skin Lesion Analysis toward Melanoma Detection: A Challenge at ISBI 2016, hosted by ISIC. arXiv preprint arXiv:1605.01397 (2016)"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3_CR16","doi-asserted-by":"crossref","unstructured":"He, T., Zhang, Z., Zhang, H., Zhang, Z., Xie, J., Li, M.: Bag of tricks for image classification with convolutional neural networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 558\u2013567 (2019)","DOI":"10.1109\/CVPR.2019.00065"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Jha, D., Smedsrud, P.H., Riegler, M.A., Halvorsen, P., de\u00a0Lange, T., Johansen, D., Johansen, H.D.: Kvasir-SEG: A segmented polyp dataset. In: MultiMedia Modeling. pp. 451\u2013462. Springer (2020)","DOI":"10.1007\/978-3-030-37734-2_37"},{"key":"3_CR18","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International conference on machine learning. pp. 4904\u20134916. PMLR (2021)"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B.C., Cardie, C., Belongie, S., Hariharan, B., Lim, S.N.: Visual prompt tuning. In: European Conference on Computer Vision. pp. 709\u2013727. Springer (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"3_CR20","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1162\/tacl_a_00324","volume":"8","author":"Z Jiang","year":"2020","unstructured":"Jiang, Z., Xu, F.F., Araki, J., Neubig, G.: How can we know what language models know? Transactions of the Association for Computational Linguistics 8, 423\u2013438 (2020)","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Jin, W., Cheng, Y., Shen, Y., Chen, W., Ren, X.: A good prompt is worth millions of parameters: Low-resource prompt-based learning for vision-language models. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). pp. 2763\u20132775 (2022)","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"3_CR22","unstructured":"Kendrick, C., Cassidy, B., Pappachan, J.M., O\u2019Shea, C., Fernandez, C.J., Chacko, E., Jacob, K., Reeves, N.D., Yap, M.H.: Translating clinical delineation of diabetic foot ulcers into machine-interpretable segmentation. arXiv preprint arXiv:2204.11618 (2022)"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: Multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"3_CR24","unstructured":"Kwon, G., Cai, Z., Ravichandran, A., Bas, E., Bhotika, R., Soatto, S.: Masked vision and language modeling for multi-modal representation learning. In: The Eleventh International Conference on Learning Representations (2023)"},{"issue":"9","key":"3_CR25","doi-asserted-by":"publisher","first-page":"2198","DOI":"10.1109\/TMI.2019.2900516","volume":"38","author":"S Leclerc","year":"2019","unstructured":"Leclerc, S., Smistad, E., Pedrosa, J., \u00d8stvik, A., Cervenansky, F., Espinosa, F., Espeland, T., Berg, E.A.R., Jodoin, P.M., Grenier, T., et al.: Deep learning for segmentation using an open large-scale dataset in 2d echocardiography. IEEE Trans. Med. Imaging 38(9), 2198\u20132210 (2019)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. pp. 3045\u20133059 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"3_CR27","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). pp. 4582\u20134597 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"3_CR28","unstructured":"Li, Y., Liang, F., Zhao, L., Cui, Y., Ouyang, W., Shao, J., Yu, F., Yan, J.: Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. In: International Conference on Learning Representations (2021)"},{"issue":"9","key":"3_CR29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Liu, X., Ji, K., Fu, Y., Tam, W., Du, Z., Yang, Z., Tang, J.: P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). pp. 61\u201368 (2022)","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"3_CR31","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"L\u00fcddecke, T., Ecker, A.: Image segmentation using text and image prompts. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 7086\u20137096 (2022)","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"3_CR33","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-sne. Journal of machine learning research 9(11) (2008)"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Milletari, F., Navab, N., Ahmadi, S.A.: V-net: Fully convolutional neural networks for volumetric medical image segmentation. In: 2016 Fourth International Conference on 3D Vision (3DV). pp. 565\u2013571. IEEE (2016)","DOI":"10.1109\/3DV.2016.79"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Ngoc\u00a0Lan, P., An, N.S., Hang, D.V., Long, D.V., Trung, T.Q., Thuy, N.T., Sang, D.V.: NeoUNet: Towards accurate colon polyp segmentation and neoplasm detection. In: Advances in Visual Computing. pp. 15\u201328. Springer (2021)","DOI":"10.1007\/978-3-030-90436-4_2"},{"key":"3_CR36","unstructured":"Poudel, K., Dhakal, M., Bhandari, P., Adhikari, R., Thapaliya, S., Khanal, B.: Exploring transfer learning in medical image segmentation using vision-language models. In: Medical Imaging with Deep Learning (2023)"},{"key":"3_CR37","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Rao, Y., Zhao, W., Chen, G., Tang, Y., Zhu, Z., Huang, G., Zhou, J., Lu, J.: Denseclip: Language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"3_CR39","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"issue":"10","key":"3_CR40","doi-asserted-by":"publisher","first-page":"867","DOI":"10.1038\/s42256-022-00536-x","volume":"4","author":"A Saporta","year":"2022","unstructured":"Saporta, A., Gui, X., Agrawal, A., Pareek, A., Truong, S.Q., Nguyen, C.D., Ngo, V.D., Seekins, J., Blankenberg, F.G., Ng, A.Y., et al.: Benchmarking saliency methods for chest x-ray interpretation. Nature Machine Intelligence 4(10), 867\u2013878 (2022)","journal-title":"Nature Machine Intelligence"},{"key":"3_CR41","doi-asserted-by":"crossref","unstructured":"Shin, T., Razeghi, Y., Logan\u00a0IV, R.L., Wallace, E., Singh, S.: Autoprompt: Eliciting knowledge from language models with automatically generated prompts. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). pp. 4222\u20134235 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"3_CR42","unstructured":"Shrestha, P., Amgain, S., Khanal, B., Linte, C.A., Bhattarai, B.: Medical vision language pretraining: A survey. arXiv preprint arXiv:2312.06224 (2023)"},{"key":"3_CR43","doi-asserted-by":"crossref","unstructured":"Singh, A., Hu, R., Goswami, V., Couairon, G., Galuba, W., Rohrbach, M., Kiela, D.: Flava: A foundational language and vision alignment model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 15638\u201315650 (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"3_CR44","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Wang, Z., Lu, Y., Li, Q., Tao, X., Guo, Y., Gong, M., Liu, T.: Cris: Clip-driven referring image segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 11686\u201311695 (2022)","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"3_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Z., Zhang, Z., Ebrahimi, S., Sun, R., Zhang, H., Lee, C.Y., Ren, X., Su, G., Perot, V., Dy, J., et\u00a0al.: Dualprompt: Complementary prompting for rehearsal-free continual learning. In: European Conference on Computer Vision. pp. 631\u2013648. Springer (2022)","DOI":"10.1007\/978-3-031-19809-0_36"},{"key":"3_CR47","doi-asserted-by":"crossref","unstructured":"Wang, Z., Zhang, Z., Lee, C.Y., Zhang, H., Sun, R., Ren, X., Su, G., Perot, V., Dy, J., Pfister, T.: Learning to prompt for continual learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 139\u2013149 (2022)","DOI":"10.1109\/CVPR52688.2022.00024"},{"key":"3_CR48","unstructured":"Watanabe, S.: Tree-structured parzen estimator: Understanding its algorithm components and their roles for better empirical performance. arXiv preprint arXiv:2304.11127 (2023)"},{"key":"3_CR49","doi-asserted-by":"crossref","unstructured":"Wu, C.E., Tian, Y., Yu, H., Wang, H., Morgado, P., Hu, Y.H., Yang, L.: Why is prompt tuning for vision-language models robust to noisy labels? In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 15488\u201315497 (2023)","DOI":"10.1109\/ICCV51070.2023.01420"},{"key":"3_CR50","doi-asserted-by":"crossref","unstructured":"Wu, C., Lin, Z., Cohen, S., Bui, T., Maji, S.: Phrasecut: Language-based image segmentation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10216\u201310225 (2020)","DOI":"10.1109\/CVPR42600.2020.01023"},{"key":"3_CR51","doi-asserted-by":"publisher","first-page":"30","DOI":"10.1016\/j.aiopen.2024.01.004","volume":"5","author":"Y Yao","year":"2024","unstructured":"Yao, Y., Zhang, A., Zhang, Z., Liu, Z., Chua, T.S., Sun, M.: CPT: Colorful prompt tuning for pre-trained vision-language models. AI Open 5, 30\u201338 (2024)","journal-title":"AI Open"},{"key":"3_CR52","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: Contrastive captioners are image-text foundation models. Transactions on Machine Learning Research (2022)"},{"key":"3_CR53","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225 (2022)"},{"key":"3_CR54","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 11975\u201311986 (2023)","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"3_CR55","doi-asserted-by":"crossref","unstructured":"Zhai, X., Wang, X., Mustafa, B., Steiner, A., Keysers, D., Kolesnikov, A., Beyer, L.: Lit: Zero-shot transfer with locked-image text tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18123\u201318133 (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"3_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Shen, Z., Jiao, R.: Segment anything model for medical image segmentation: Current applications and future directions. Computers in Biology and Medicine p. 108238 (2024)","DOI":"10.1016\/j.compbiomed.2024.108238"},{"key":"3_CR57","unstructured":"Zhang, Y., Zhou, K., Liu, Z.: Neural prompt search. arXiv preprint arXiv:2206.04673 (2022)"},{"key":"3_CR58","unstructured":"Zhang, Y., Jiang, H., Miura, Y., Manning, C.D., Langlotz, C.P.: Contrastive learning of medical visual representations from paired images and text. In: Machine Learning for Healthcare Conference. pp. 2\u201325. PMLR (2022)"},{"key":"3_CR59","unstructured":"Zhao, Z., Liu, Y., Wu, H., Li, Y., Wang, S., Teng, L., Liu, D., Li, X., Cui, Z., Wang, Q., et\u00a0al.: Clip in medical imaging: A comprehensive survey. arXiv preprint arXiv:2312.07353 (2023)"},{"key":"3_CR60","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Friedman, D., Chen, D.: Factual probing is [mask]: Learning vs. learning to recall. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. pp. 5017\u20135033 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.398"},{"key":"3_CR61","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"3_CR62","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR63","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Lei, Y., Zhang, B., Liu, L., Liu, Y.: ZegCLIP: Towards adapting clip for zero-shot semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 11175\u201311185 (2023)","DOI":"10.1109\/CVPR52729.2023.01075"},{"key":"3_CR64","doi-asserted-by":"crossref","unstructured":"Zhu, B., Niu, Y., Han, Y., Wu, Y., Zhang, H.: Prompt-aligned gradient for prompt tuning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 15659\u201315669 (2023)","DOI":"10.1109\/ICCV51070.2023.01435"},{"key":"3_CR65","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models. In: The Twelfth International Conference on Learning Representations (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0908-6_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:09:23Z","timestamp":1733515763000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0908-6_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,7]]},"ISBN":["9789819609079","9789819609086"],"references-count":65,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0908-6_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}