{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:50:49Z","timestamp":1761094249738,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3728424.3760770","type":"proceedings-article","created":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:17:42Z","timestamp":1761059862000},"page":"45-51","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning Joint Text and Visual Tokens in CLIP for Medical Image Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1171-5672","authenticated-orcid":false,"given":"Raffaele","family":"Mineo","sequence":"first","affiliation":[{"name":"University of Catania, Catania, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1333-8348","authenticated-orcid":false,"given":"Giovanni","family":"Bellitto","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6122-4249","authenticated-orcid":false,"given":"Federica","family":"Proietto Salanitri","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9492-5954","authenticated-orcid":false,"given":"Rutger","family":"Hendrix","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6653-2577","authenticated-orcid":false,"given":"Concetto","family":"Spampinato","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, 
Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2441-0982","authenticated-orcid":false,"given":"Simone","family":"Palazzo","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}]}],"member":"320","published-online":{"date-parts":[[2025,10,26]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"European conference on computer vision. Springer, 1-21","author":"Boecking Benedikt","year":"2022","unstructured":"Benedikt Boecking, Naoto Usuyama, Shruthi Bannur, Daniel C Castro, Anton Schwaighofer, Stephanie Hyland, Maria Wetscherek, Tristan Naumann, Aditya Nori, Javier Alvarez-Valle, et al., 2022. Making the most of text semantics to improve biomedical vision-language processing. In European conference on computer vision. Springer, 1-21."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","first-page":"1820","DOI":"10.1109\/LSP.2024.3420083","article-title":"Vision-language consistency guided multi-modal prompt learning for blind ai generated image quality assessment","volume":"31","author":"Fu Jun","year":"2024","unstructured":"Jun Fu, Wei Zhou, Qiuping Jiang, Hantao Liu, and Guangtao Zhai. 2024. Vision-language consistency guided multi-modal prompt learning for blind ai generated image quality assessment. IEEE Signal Processing Letters, Vol. 31 (2024), 1820-1824.","journal-title":"IEEE Signal Processing Letters"},{"key":"e_1_3_2_1_3_1","unstructured":"Juncen Guo Yang Liu Xiaoguang Zhu Lianlong Sun Liangyu Teng Jingyi Wu Di Li Linxiao Gong Weiwei Jiang Wei Zhou and Liang Song. 2025. CalFuse: Feature Calibration Enhanced Parameter Fusion for Class-Continual Learning."},{"key":"e_1_3_2_1_4_1","volume-title":"International conference on machine learning. PMLR, 2790-2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. 
In International conference on machine learning. PMLR, 2790-2799."},{"key":"e_1_3_2_1_5_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. ICLR, Vol. 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_6_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00324"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_10_1","volume-title":"Quantifying the Carbon Emissions of Machine Learning. arXiv preprint arXiv:1910.09700","author":"Lacoste Alexandre","year":"2019","unstructured":"Alexandre Lacoste, Alexandra Luccioni, Victor Schmidt, and Thomas Dandres. 2019. Quantifying the Carbon Emissions of Machine Learning. arXiv preprint arXiv:1910.09700 (2019)."},{"key":"e_1_3_2_1_11_1","volume-title":"The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester Brian","year":"2021","unstructured":"Brian Lester, Rami Al-Rfou, and Noah Constant. 2021. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_12_1","volume-title":"International conference on machine learning. 
PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_13_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_14_1","volume-title":"Zhengxiao Du, Zhilin Yang, and Jie Tang.","author":"Liu Xiao","year":"2021","unstructured":"Xiao Liu, Kaixuan Ji, Yicheng Fu, Weng Lam Tam, Zhengxiao Du, Zhilin Yang, and Jie Tang. 2021. P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)."},{"key":"e_1_3_2_1_15_1","volume-title":"Tri Cao, Binh Nguyen, Paul Swoboda, Nhat Ho, Shadi Albarqouni, Pengtao Xie, et al.","author":"Nguyen Duy MH","year":"2024","unstructured":"Duy MH Nguyen, Hoang Nguyen, Nghiem Diep, Tan Ngoc Pham, Tri Cao, Binh Nguyen, Paul Swoboda, Nhat Ho, Shadi Albarqouni, Pengtao Xie, et al., 2024. Lvm-med: Learning large-scale self-supervised vision models for medical imaging via second-order graph matching. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. 
PMLR, 8748-8763."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_18_1","volume-title":"Eric Wallace, and Sameer Singh.","author":"Shin Taylor","year":"2020","unstructured":"Taylor Shin, Yasaman Razeghi, Robert L Logan IV, Eric Wallace, and Sameer Singh. 2020. Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980 (2020)."},{"key":"e_1_3_2_1_19_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 101-111","author":"Wang Zhao","year":"2023","unstructured":"Zhao Wang, Chang Liu, Shaoting Zhang, and Qi Dou. 2023. Foundation model for endoscopy video analysis via large-scale self-supervised pre-train. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 101-111."},{"key":"e_1_3_2_1_21_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. Simvlm: Simple visual language model pretraining with weak supervision. 
arXiv preprint arXiv:2108.10904 (2021)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_36"},{"key":"e_1_3_2_1_23_1","volume-title":"Kaleb E Smith, Christopher Parisien, Colin Compas, Cheryl Martin, Mona G Flores, Ying Zhang, et al.","author":"Yang Xi","year":"2022","unstructured":"Xi Yang, Aokun Chen, Nima PourNejatian, Hoo Chang Shin, Kaleb E Smith, Christopher Parisien, Colin Compas, Cheryl Martin, Mona G Flores, Ying Zhang, et al., 2022. Gatortron: A large clinical language model to unlock patient information from unstructured electronic health records. arXiv preprint arXiv:2203.03540 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"Parameterized Diffusion Optimization enabled Autoregressive Ordinal Regression for Diabetic Retinopathy Grading. arXiv preprint arXiv:2507.04978","author":"Yu Qinkai","year":"2025","unstructured":"Qinkai Yu, Wei Zhou, Hantao Liu, Yanyu Xu, Meng Wang, Yitian Zhao, Huazhu Fu, Xujiong Ye, Yalin Zheng, and Yanda Meng. 2025. Parameterized Diffusion Optimization enabled Autoregressive Ordinal Regression for Diabetic Retinopathy Grading. arXiv preprint arXiv:2507.04978 (2025)."},{"key":"e_1_3_2_1_26_1","volume-title":"Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, et al., 2021. Florence: A new foundation model for computer vision. 
arXiv preprint arXiv:2111.11432 (2021)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","first-page":"954","DOI":"10.1109\/LSP.2024.3378106","article-title":"Boundary refinement network for colorectal polyp segmentation in colonoscopy images","volume":"31","author":"Yue Guanghui","year":"2024","unstructured":"Guanghui Yue, Yuanyan Li, Wenchao Jiang, Wei Zhou, and Tianwei Zhou. 2024a. Boundary refinement network for colorectal polyp segmentation in colonoscopy images. IEEE Signal Processing Letters, Vol. 31 (2024), 954-958.","journal-title":"IEEE Signal Processing Letters"},{"key":"e_1_3_2_1_28_1","volume-title":"Subjective and objective quality assessment of colonoscopy videos","author":"Yue Guanghui","year":"2024","unstructured":"Guanghui Yue, Lixin Zhang, Jingfeng Du, Tianwei Zhou, Wei Zhou, and Weisi Lin. 2024b. Subjective and objective quality assessment of colonoscopy videos. IEEE Transactions on Medical Imaging (2024)."},{"key":"e_1_3_2_1_29_1","volume-title":"2023 IEEE International Conference on Image Processing (ICIP). IEEE, 3005-3009","author":"Yue Guanghui","year":"2023","unstructured":"Guanghui Yue, Shaoping Zhang, Yuan Li, Xiaoyan Zhou, Tianwei Zhou, and Wei Zhou. 2023. Subjective quality assessment of enhanced retinal images. In 2023 IEEE International Conference on Image Processing (ICIP). IEEE, 3005-3009."},{"key":"e_1_3_2_1_30_1","volume-title":"Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199","author":"Zaken Elad Ben","year":"2021","unstructured":"Elad Ben Zaken, Shauli Ravfogel, and Yoav Goldberg. 2021. Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199 (2021)."},{"key":"e_1_3_2_1_31_1","volume-title":"2025 IEEE International Symposium on Circuits and Systems (ISCAS). 
IEEE, 1-5.","author":"Zeng Yirui","year":"2025","unstructured":"Yirui Zeng, Jun Fu, Hadi Amirpour, Huasheng Wang, Guanghui Yue, Hantao Liu, Ying Chen, and Wei Zhou. 2025. Clip-dqa: Blindly evaluating dehazed images from global and local perspectives using clip. In 2025 IEEE International Symposium on Circuits and Systems (ISCAS). IEEE, 1-5."},{"key":"e_1_3_2_1_32_1","unstructured":"Sheng Zhang Yanbo Xu Naoto Usuyama Hanwen Xu Jaspreet Bagga Robert Tinn Sam Preston Rajesh Rao Mu Wei Naveen Valluri et al. 2023b. Biomedclip: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs. arXiv preprint arXiv:2303.00915 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 272-282","author":"Zhang Yunkun","year":"2023","unstructured":"Yunkun Zhang, Jin Gao, Mu Zhou, Xiaosong Wang, Yu Qiao, Shaoting Zhang, and Dequan Wang. 2023a. Text-guided foundation model adaptation for pathological image classification. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 272-282."},{"key":"e_1_3_2_1_34_1","volume-title":"Machine Learning for Healthcare Conference. PMLR, 2-25","author":"Zhang Yuhao","year":"2022","unstructured":"Yuhao Zhang, Hang Jiang, Yasuhide Miura, Christopher D Manning, and Curtis P Langlotz. 2022. Contrastive learning of medical visual representations from paired images and text. In Machine Learning for Healthcare Conference. PMLR, 2-25."},{"key":"e_1_3_2_1_35_1","volume-title":"Factual probing is [mask]: Learning vs. learning to recall. arXiv preprint arXiv:2104.05240","author":"Zhong Zexuan","year":"2021","unstructured":"Zexuan Zhong, Dan Friedman, and Danqi Chen. 2021. Factual probing is [mask]: Learning vs. learning to recall. 
arXiv preprint arXiv:2104.05240 (2021)."},{"key":"e_1_3_2_1_36_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022a. Cocoop: Conditional prompt learning for vision-language models. arXiv preprint arXiv:2203.05557 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '25:The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2nd International Workshop on Multimedia Computing for Health and Medicine"],"original-title":[],"deposited":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:18:23Z","timestamp":1761059903000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3728424.3760770"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,26]]},"references-count":37,"alternative-id":["10.1145\/3728424.3760770","10.1145\/3728424"],"URL":"https:\/\/doi.org\/10.1145\/3728424.3760770","relation":{},"subject":[],"published":{"date-parts":[[2025,10,26]]},"assertion":[{"value":"2025-10-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}