{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T14:10:30Z","timestamp":1773843030093,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T00:00:00Z","timestamp":1742774400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The Science and Technology Program of Jinhua","award":["2023-3-001a"],"award-info":[{"award-number":["2023-3-001a"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302432"],"award-info":[{"award-number":["62302432"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,24]]},"DOI":"10.1145\/3708359.3712098","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T12:50:34Z","timestamp":1742388634000},"page":"1287-1296","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SkinGEN: an Explainable Dermatology Diagnosis-to-Generation Framework with Interactive Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5682-2140","authenticated-orcid":false,"given":"Bo","family":"Lin","sequence":"first","affiliation":[{"name":"Innovation Centre for Information, Binjiang Institute of Zhejiang University, Hangzhou, China and School of Software Technology, Zhejiang University, Ningbo, China,"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5856-1972","authenticated-orcid":false,"given":"Yingjing","family":"Xu","sequence":"additional","affiliation":[{"name":"Zhejiang University, School of Software, Ningbo, China,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6232-3783","authenticated-orcid":false,"given":"Xuanwen","family":"Bao","sequence":"additional","affiliation":[{"name":"The First Affiliated Hospital of Zhejiang University School of Medicine, Hangzhou, Zhejiang Province, China,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China,"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8552-6174","authenticated-orcid":false,"given":"Zhouyang","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Design, Hunan University, Changsha, China,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4703-7348","authenticated-orcid":false,"given":"Jianwei","family":"Yin","sequence":"additional","affiliation":[{"name":"Center for Data Science, Zhejiang University, Hangzhou, China,"}]}],"member":"320","published-online":{"date-parts":[[2025,3,24]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Abubakar Abid Ali Abdalla Ali Abid Dawood Khan Abdulrahman Alfozan and James Zou. 2019. Gradio: Hassle-Free Sharing and Testing of ML Models in the Wild. arxiv:https:\/\/arXiv.org\/abs\/1906.02569\u00a0[cs.LG]"},{"key":"e_1_3_3_2_3_2","first-page":"99","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","author":"Akrout Mohamed","year":"2023","unstructured":"Mohamed Akrout, B\u00e1lint Gyepesi, P\u00e9ter Holl\u00f3, Adrienn Po\u00f3r, Bl\u00e1ga Kincs\u0151, Stephen Solis, Katrina Cirone, Jeremy Kawahara, Dekker Slade, Latif Abid, et\u00a0al. 2023. Diffusion-based data augmentation for skin disease classification: Impact across original medical datasets to fully synthetic images. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 99\u2013109."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00305"},{"key":"e_1_3_3_2_5_2","unstructured":"Anas Awadalla Irena Gao Josh Gardner Jack Hessel Yusuf Hanafy Wanrong Zhu Kalyani Marathe Yonatan Bitton Samir Gadre Shiori Sagawa Jenia Jitsev Simon Kornblith Pang\u00a0Wei Koh Gabriel Ilharco Mitchell Wortsman and Ludwig Schmidt. 2023. OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models. arxiv:https:\/\/arXiv.org\/abs\/2308.01390\u00a0[cs.CV]"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1017\/9781108955652"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"William Castillo-Gonz\u00e1lez Carlos\u00a0Oscar Lepez and Mabel\u00a0Cecilia Bonardi. 2022. Chat GPT: a promising tool for academic editing. Data and Metadata 1 (2022) 23\u201323.","DOI":"10.56294\/dm202223"},{"key":"e_1_3_3_2_8_2","first-page":"3909","volume-title":"Conference on Robot Learning","author":"Chebotar Yevgen","year":"2023","unstructured":"Yevgen Chebotar, Quan Vuong, Karol Hausman, Fei Xia, Yao Lu, Alex Irpan, Aviral Kumar, Tianhe Yu, Alexander Herzog, Karl Pertsch, et\u00a0al. 2023. Q-transformer: Scalable offline reinforcement learning via autoregressive q-functions. In Conference on Robot Learning. PMLR, 3909\u20133928."},{"key":"e_1_3_3_2_9_2","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique Ponde de\u00a0Oliveira Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman et\u00a0al. 2021. Evaluating large language models trained on code. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.03374 (2021)."},{"key":"e_1_3_3_2_10_2","unstructured":"Wenliang Dai Junnan Li Dongxu Li Anthony Meng\u00a0Huat Tiong Junqi Zhao Weisheng Wang Boyang Li Pascale Fung and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2305.06500\u00a0[cs.CV]"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Onat Dalmaz Mahmut Yurt and Tolga \u00c7ukur. 2022. ResViT: residual vision transformers for multimodal medical image synthesis. IEEE Transactions on Medical Imaging 41 10 (2022) 2598\u20132614.","DOI":"10.1109\/TMI.2022.3167808"},{"key":"e_1_3_3_2_12_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-eacl.88"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Muhammad\u00a0Ali Farooq Wang Yao Michael Schukat Mark\u00a0A Little and Peter Corcoran. 2024. Derm-T2IM: Harnessing Synthetic Skin Lesion Data via Stable Diffusion Models for Enhanced Skin Disease Classification using ViT and CNN. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.05159 (2024).","DOI":"10.1109\/EMBC53108.2024.10781852"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"C Flohr and RJBJoD Hay. 2021. Putting the burden of skin diseases on the global map. 189\u2013190\u00a0pages.","DOI":"10.1111\/bjd.19704"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00201"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Iryna Hartsock and Ghulam Rasool. 2024. Vision-Language Models for Medical Report Generation and Visual Question Answering: A Review. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.02469 (2024).","DOI":"10.3389\/frai.2024.1430984"},{"key":"e_1_3_3_2_18_2","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.09685 (2021)."},{"key":"e_1_3_3_2_19_2","unstructured":"Mingzhe Hu Yuheng Li and Xiaofeng Yang. 2023. SkinSAM: Empowering Skin Cancer Segmentation with Segment Anything Model. arxiv:https:\/\/arXiv.org\/abs\/2304.13973\u00a0[cs.CV]"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander\u00a0C. Berg Wan-Yen Lo Piotr Doll\u00e1r and Ross Girshick. 2023. Segment Anything. arXiv:https:\/\/arXiv.org\/abs\/2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_3_2_22_2","unstructured":"Bo Li Yuanhan Zhang Liangyu Chen Jinghao Wang Jingkang Yang and Ziwei Liu. 2023. Otter: A Multi-Modal Model with In-Context Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2305.03726\u00a0[cs.CV]"},{"key":"e_1_3_3_2_23_2","unstructured":"Chunyuan Li Cliff Wong Sheng Zhang Naoto Usuyama Haotian Liu Jianwei Yang Tristan Naumann Hoifung Poon and Jianfeng Gao. 2024. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_24_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Wei Li Shiping Wen Kaibo Shi Yin Yang and Tingwen Huang. 2022. Neural architecture search with a lightweight transformer for text-to-image synthesis. IEEE Transactions on Network Science and Engineering 9 3 (2022) 1567\u20131576.","DOI":"10.1109\/TNSE.2022.3147787"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Jiamin Liang Xin Yang Yuhao Huang Haoming Li Shuangchi He Xindi Hu Zejian Chen Wufeng Xue Jun Cheng and Dong Ni. 2022. Sketch guided and progressive growing GAN for realistic and editable ultrasound image synthesis. Medical image analysis 79 (2022) 102461.","DOI":"10.1016\/j.media.2022.102461"},{"key":"e_1_3_3_2_27_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2304.08485\u00a0[cs.CV]"},{"key":"e_1_3_3_2_28_2","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et\u00a0al. 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.05499 (2023)."},{"key":"e_1_3_3_2_29_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Jong\u00a0Hak Moon Hyungyung Lee Woncheol Shin Young-Hak Kim and Edward Choi. 2022. Multi-modal understanding and generation for medical images and text via vision-language pre-training. IEEE Journal of Biomedical and Health Informatics 26 12 (2022) 6070\u20136080.","DOI":"10.1109\/JBHI.2022.3207502"},{"key":"e_1_3_3_2_31_2","first-page":"353","volume-title":"Machine Learning for Health (ML4H)","author":"Moor Michael","year":"2023","unstructured":"Michael Moor, Qian Huang, Shirley Wu, Michihiro Yasunaga, Yash Dalmia, Jure Leskovec, Cyril Zakka, Eduardo\u00a0Pontes Reis, and Pranav Rajpurkar. 2023. Med-flamingo: a multimodal medical few-shot learner. In Machine Learning for Health (ML4H). PMLR, 353\u2013367."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Annamaria Offidani Oriana Simonetti Maria\u00a0Luisa Bernardini Ayhan Alpagut Andreina Cellini and Guido Bossi. 2002. General practitioners\u2019 accuracy in diagnosing skin cancers. Dermatology 205 2 (2002) 127\u2013130.","DOI":"10.1159\/000063887"},{"key":"e_1_3_3_2_33_2","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et\u00a0al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.07193 (2023)."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/1111449.1111475"},{"key":"e_1_3_3_2_35_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV]"},{"key":"e_1_3_3_2_36_2","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Adrit Rao and Oliver Aalami. 2023. Towards improving the visual explainability of artificial intelligence in the clinical setting. BMC Digital Health 1 1 (2023) 23.","DOI":"10.1186\/s44247-023-00022-3"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2112.10752\u00a0[cs.CV]","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_2_41_2","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman Patrick Schramowski Srivatsa Kundurthy Katherine Crowson Ludwig Schmidt Robert Kaczmarczyk and Jenia Jitsev. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. arxiv:https:\/\/arXiv.org\/abs\/2210.08402\u00a0[cs.CV]"},{"key":"e_1_3_3_2_42_2","volume-title":"IUI Workshops","author":"Stumpf Simone","year":"2018","unstructured":"Simone Stumpf, Simonas Skrebe, Graeme Aymer, and Julie Hobson. 2018. Explaining smart heating systems to discourage fiddling with optimized behavior. In IUI Workshops."},{"key":"e_1_3_3_2_43_2","unstructured":"Rohan Taori Ishaan Gulrajani Tianyi Zhang Yann Dubois Xuechen Li Carlos Guestrin Percy Liang and Tatsunori\u00a0B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_3_2_44_2","unstructured":"Omkar Thawkar Abdelrahman Shaker Sahal\u00a0Shaji Mullappilly Hisham Cholakkal Rao\u00a0Muhammad Anwer Salman Khan Jorma Laaksonen and Fahad\u00a0Shahbaz Khan. 2023. Xraygpt: Chest radiographs summarization using medical vision-language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.07971 (2023)."},{"key":"e_1_3_3_2_45_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL]"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"crossref","unstructured":"Philipp Tschandl Cliff Rosendahl and Harald Kittler. 2018. The HAM10000 dataset a large collection of multi-source dermatoscopic images of common pigmented skin lesions. Scientific data 5 1 (2018) 1\u20139.","DOI":"10.1038\/sdata.2018.161"},{"key":"e_1_3_3_2_47_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_48_2","unstructured":"Weihan Wang Qingsong Lv Wenmeng Yu Wenyi Hong Ji Qi Yan Wang Junhui Ji Zhuoyi Yang Lei Zhao Xixuan Song et\u00a0al. 2023. Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.03079 (2023)."},{"key":"e_1_3_3_2_49_2","unstructured":"Abbi Ward Jimmy Li Julie Wang Sriram Lakshminarasimhan Ashley Carrick Bilson Campana Jay Hartford Pradeep\u00a0Kumar S Tiya Tiyasirichokchai Sunny Virmani Renee Wong Yossi Matias Greg\u00a0S. Corrado Dale\u00a0R. Webster Dawn Siegel Steven Lin Justin Ko Alan Karthikesalingam Christopher Semturs and Pooja Rao. 2024. Crowdsourcing Dermatology Images with Google Search Ads: Creating a Real-World Skin Condition Dataset. arxiv:https:\/\/arXiv.org\/abs\/2402.18545\u00a0[cs.CY]"},{"key":"e_1_3_3_2_50_2","unstructured":"Jerry Wei Jason Wei Yi Tay Dustin Tran Albert Webson Yifeng Lu Xinyun Chen Hanxiao Liu Da Huang Denny Zhou and Tengyu Ma. 2023. Larger language models do in-context learning differently. arxiv:https:\/\/arXiv.org\/abs\/2303.03846\u00a0[cs.CL]"},{"key":"e_1_3_3_2_51_2","unstructured":"Sean Welleck Jiacheng Liu Ximing Lu Hannaneh Hajishirzi and Yejin Choi. 2022. Naturalprover: Grounded mathematical proof generation with language models. Advances in Neural Information Processing Systems 35 (2022) 4913\u20134927."},{"key":"e_1_3_3_2_52_2","unstructured":"Xuansheng Wu Haiyan Zhao Yaochen Zhu Yucheng Shi Fan Yang Tianming Liu Xiaoming Zhai Wenlin Yao Jundong Li Mengnan Du et\u00a0al. 2024. Usable xai: 10 strategies towards exploiting explainability in the llm era. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08946 (2024)."},{"key":"e_1_3_3_2_53_2","unstructured":"Hu Ye Jun Zhang Sibo Liu Xiao Han and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06721 (2023)."},{"key":"e_1_3_3_2_54_2","unstructured":"Zhengqing Yuan Huiwen Xue Xinyi Wang Yongming Liu Zhuanzhe Zhao and Kun Wang. 2023. ArtGPT-4: Artistic Vision-Language Understanding with Adapter-enhanced MiniGPT-4. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.07490 (2023)."},{"key":"e_1_3_3_2_55_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric.\u00a0P Xing Hao Zhang Joseph\u00a0E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arxiv:https:\/\/arXiv.org\/abs\/2306.05685\u00a0[cs.CL]"},{"key":"e_1_3_3_2_56_2","unstructured":"Juexiao Zhou Xiaonan He Liyuan Sun Jiannan Xu Xiuying Chen Yuetan Chu Longxi Zhou Xingyu Liao Bin Zhang and Xin Gao. 2023. SkinGPT-4: an interactive dermatology diagnostic system with visual large language model. (2023)."},{"key":"e_1_3_3_2_57_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.10592 (2023)."}],"event":{"name":"IUI '25: 30th International Conference on Intelligent User Interfaces","location":"Cagliari Italy","acronym":"IUI '25","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 30th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712098","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712098","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:09:46Z","timestamp":1750295386000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712098"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,24]]},"references-count":56,"alternative-id":["10.1145\/3708359.3712098","10.1145\/3708359"],"URL":"https:\/\/doi.org\/10.1145\/3708359.3712098","relation":{},"subject":[],"published":{"date-parts":[[2025,3,24]]},"assertion":[{"value":"2025-03-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}