{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T15:02:23Z","timestamp":1782313343807,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"name":"NSFC","award":["62472102, 62372117, 62402120"],"award-info":[{"award-number":["62472102, 62372117, 62402120"]}]},{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai","doi-asserted-by":"publisher","award":["24ZR1490400"],"award-info":[{"award-number":["24ZR1490400"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755187","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"3769-3778","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MM-Skin: Enhancing Dermatology Vision-Language Model with an Image-Text Dataset Derived from Textbooks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5535-4128","authenticated-orcid":false,"given":"Wenqi","family":"Zeng","sequence":"first","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7179-5045","authenticated-orcid":false,"given":"Yuqi","family":"Sun","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5577-5773","authenticated-orcid":false,"given":"Chenxi","family":"Ma","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7677-4772","authenticated-orcid":false,"given":"Weimin","family":"Tan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0256-9682","authenticated-orcid":false,"given":"Bo","family":"Yan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Artificial Intelligence, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Minigpt-med: Large language model as a general interface for radiology diagnosis. arXiv preprint arXiv:2407.04106","author":"Alkhaldi Asma","year":"2024","unstructured":"Asma Alkhaldi, Raneem Alnajim, Layan Alabdullatef, Rawan Alyahya, Jun Chen, Deyao Zhu, Ahmed Alsinan, and Mohamed Elhoseiny. 2024. Minigpt-med: Large language model as a general interface for radiology diagnosis. arXiv preprint arXiv:2407.04106 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arXiv:2308.12966 [cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the CLEF 2021 Conference and Labs of the Evaluation Forum-working notes.","author":"Abacha Asma Ben","year":"2021","unstructured":"Asma Ben Abacha, Mourad Sarrouti, Dina Demner-Fushman, Sadid A Hasan, and Henning M\u00fcller. 2021. Overview of the vqa-med task at imageclef 2021: Visual question answering and generation in the medical domain. In Proceedings of the CLEF 2021 Conference and Labs of the Evaluation Forum-working notes. 21-24 September 2021."},{"key":"e_1_3_2_1_4_1","volume-title":"Eyegpt: Ophthalmic assistant with large language models. arXiv preprint arXiv:2403.00840","author":"Chen Xiaolan","year":"2024","unstructured":"Xiaolan Chen, Ziwei Zhao, Weiyi Zhang, Pusheng Xu, Le Gao, Mingpu Xu, Yue Wu, Yinwen Li, Danli Shi, and Mingguang He. 2024b. Eyegpt: Ophthalmic assistant with large language models. arXiv preprint arXiv:2403.00840 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-16443-9_65"},{"key":"e_1_3_2_1_6_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024a. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_1_7_1","unstructured":"Noel Codella Veronica Rotemberg Philipp Tschandl M Emre Celebi Stephen Dusza David Gutman Brian Helba Aadi Kalloo Konstantinos Liopyris Michael Marchetti et al. 2019. Skin lesion analysis toward melanoma detection 2018: A challenge hosted by the international skin imaging collaboration (isic). arXiv preprint arXiv:1902.03368 (2019)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI.2018.8363547"},{"key":"e_1_3_2_1_9_1","volume-title":"Veronica Rotemberg, Brian Helba, Veronica Vilaplana, Ofer Reiter, Cristina Carrera, Alicia Barreiro, Allan C Halpern, Susana Puig, et al.","author":"Combalia Marc","year":"2019","unstructured":"Marc Combalia, Noel CF Codella, Veronica Rotemberg, Brian Helba, Veronica Vilaplana, Ofer Reiter, Cristina Carrera, Alicia Barreiro, Allan C Halpern, Susana Puig, et al., 2019. Bcn20000: Dermoscopic lesions in the wild. arXiv preprint arXiv:1908.02288 (2019)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Roxana Daneshjou Kailas Vodrahalli Roberto A Novoa Melissa Jenkins Weixin Liang Veronica Rotemberg Justin Ko Susan M Swetter Elizabeth E Bailey Olivier Gevaert et al. 2022a. Disparities in dermatology AI performance on a diverse curated clinical image set. Science advances Vol. 8 31 (2022) eabq6147.","DOI":"10.1126\/sciadv.abq6147"},{"key":"e_1_3_2_1_11_1","first-page":"18157","article-title":"Skincon: A skin disease dataset densely annotated by domain experts for fine-grained debugging and analysis","volume":"35","author":"Daneshjou Roxana","year":"2022","unstructured":"Roxana Daneshjou, Mert Yuksekgonul, Zhuo Ran Cai, Roberto Novoa, and James Y Zou. 2022b. Skincon: A skin disease dataset densely annotated by domain experts for fine-grained debugging and analysis. Advances in Neural Information Processing Systems, Vol. 35 (2022), 18157-18167.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","unstructured":"DermNet. [n.d.]. Dermnet. https:\/\/dermnet.com\/"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-eacl.88"},{"key":"e_1_3_2_1_14_1","volume-title":"Towards LLM-driven dialogue state tracking. arXiv preprint arXiv:2310.14970","author":"Feng Yujie","year":"2023","unstructured":"Yujie Feng, Zexin Lu, Bo Liu, Liming Zhan, and Xiao-Ming Wu. 2023. Towards LLM-driven dialogue state tracking. arXiv preprint arXiv:2310.14970 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00201"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27963"},{"key":"e_1_3_2_1_17_1","volume-title":"Pathvqa: 30000 questions for medical visual question answering. arXiv preprint arXiv:2003.10286","author":"He Xuehai","year":"2020","unstructured":"Xuehai He, Yichen Zhang, Luntian Mou, Eric Xing, and Pengtao Xie. 2020. Pathvqa: 30000 questions for medical visual question answering. arXiv preprint arXiv:2003.10286 (2020)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02093"},{"key":"e_1_3_2_1_19_1","unstructured":"AQ Jiang A Sablayrolles A Mensch C Bamford DS Chaplot D de Las Casas F Bressand G Lengyel G Lample L Saulnier et al. 2023. Mistral 7b. CoRR abs\/2310.06825 2023. doi: 10.48550. arXiv preprint ARXIV.2310.06825 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"a de-identified publicly available database of chest radiographs with free-text reports. Scientific data","author":"Johnson Alistair EW","year":"2019","unstructured":"Alistair EW Johnson, Tom J Pollard, Seth J Berkowitz, Nathaniel R Greenbaum, Matthew P Lungren, Chih-ying Deng, Roger G Mark, and Steven Horng. 2019. MIMIC-CXR, a de-identified publicly available database of chest radiographs with free-text reports. Scientific data, Vol. 6, 1 (2019), 317."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.3389\/fonc.2022.1022967"},{"key":"e_1_3_2_1_22_1","volume-title":"Asma Ben Abacha, and Dina Demner-Fushman","author":"Lau Jason J","year":"2018","unstructured":"Jason J Lau, Soumya Gayen, Asma Ben Abacha, and Dina Demner-Fushman. 2018. A dataset of clinically generated visual questions and answers about radiology images. Scientific data, Vol. 5, 1 (2018), 1-10."},{"key":"e_1_3_2_1_23_1","first-page":"28541","article-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day","volume":"36","author":"Li Chunyuan","year":"2023","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2023. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2023), 28541-28564.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"M: A Large Vision-Language Model and A Comprehensive Multimodal Dataset Towards General Medical AI. arXiv preprint arXiv:2411.14522","author":"Li Tianbin","year":"2024","unstructured":"Tianbin Li, Yanzhou Su, Wei Li, Bin Fu, Zhe Chen, Ziyan Huang, Guoan Wang, Chenglong Ma, Ying Chen, Ming Hu, et al., 2024. GMAI-VL & GMAI-VL-5.5 M: A Large Vision-Language Model and A Comprehensive Multimodal Dataset Towards General Medical AI. arXiv preprint arXiv:2411.14522 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43993-3_51"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43993-3_51"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87196-3_20"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"e_1_3_2_1_29_1","volume-title":"A Large-Scale, Groundable, and Explainable Medical VQA Benchmark for Chest X-ray Diagnosis. arXiv preprint arXiv:2411.16778","author":"Liu Bo","year":"2024","unstructured":"Bo Liu, Ke Zou, Liming Zhan, Zexin Lu, Xiaoyu Dong, Yidi Chen, Chengqiang Xie, Jiannong Cao, Xiao-Ming Wu, and Huazhu Fu. 2024c. GEMeX: A Large-Scale, Groundable, and Explainable Medical VQA Benchmark for Chest X-ray Diagnosis. arXiv preprint arXiv:2411.16778 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"PeFoMed: Parameter Efficient Fine-tuning of Multimodal Large Language Models for Medical Imaging. arXiv preprint arXiv:2401.02797","author":"Liu Gang","year":"2024","unstructured":"Gang Liu, Jinlong He, Pengfei Li, Genrong He, Zhaolin Chen, and Shenjun Zhong. 2024a. PeFoMed: Parameter Efficient Fine-tuning of Multimodal Large Language Models for Medical Imaging. arXiv preprint arXiv:2401.02797 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_32_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-024-07618-3"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2022.3207502"},{"key":"e_1_3_2_1_35_1","first-page":"353","article-title":"Med-flamingo: a multimodal medical few-shot learner. In Machine Learning for Health (ML4H)","author":"Moor Michael","year":"2023","unstructured":"Michael Moor, Qian Huang, Shirley Wu, Michihiro Yasunaga, Yash Dalmia, Jure Leskovec, Cyril Zakka, Eduardo Pontes Reis, and Pranav Rajpurkar. 2023. Med-flamingo: a multimodal medical few-shot learner. In Machine Learning for Health (ML4H). PMLR, 353-367.","journal-title":"PMLR"},{"key":"e_1_3_2_1_36_1","first-page":"522","volume-title":"Shenzhen","author":"Nguyen Binh D","year":"2019","unstructured":"Binh D Nguyen, Thanh-Toan Do, Binh X Nguyen, Tuong Do, Erman Tjiputra, and Quang D Tran. 2019. Overcoming data limitation in medical visual question answering. In Medical Image Computing and Computer Assisted Intervention-MICCAI 2019: 22nd International Conference, Shenzhen, China, October 13-17, 2019, Proceedings, Part IV 22. Springer, 522-530."},{"key":"e_1_3_2_1_37_1","volume-title":"Jos\u00e9 GM Esgario, Alana C Simora, Pedro BC Castro, et al.","author":"Pacheco Andre GC","year":"2020","unstructured":"Andre GC Pacheco, Gustavo R Lima, Amanda S Salomao, Breno Krohling, Igor P Biral, Gabriel G de Angelo, F\u00e1bio CR Alves Jr, Jos\u00e9 GM Esgario, Alana C Simora, Pedro BC Castro, et al., 2020. PAD-UFES-20: A skin lesion dataset composed of patient data and clinical images collected from smartphones. Data in brief, Vol. 32 (2020), 106221."},{"key":"e_1_3_2_1_38_1","volume-title":"International conference on machine learning. Pmlr, 8821-8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821-8831."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-023-02630-0"},{"key":"e_1_3_2_1_40_1","unstructured":"Karan Singhal Tao Tu Juraj Gottweis Rory Sayres Ellery Wulczyn Mohamed Amin Le Hou Kevin Clark Stephen R Pfohl Heather Cole-Lewis et al. 2025. Toward expert-level medical question answering with large language models. Nature Medicine (2025) 1-8."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-16443-9_68"},{"key":"e_1_3_2_1_42_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.bionlp-1.35"},{"key":"e_1_3_2_1_44_1","volume-title":"Hisham Cholakkal, Rao Muhammad Anwer, Salman Khan, Jorma Laaksonen, and Fahad Shahbaz Khan.","author":"Thawkar Omkar","year":"2023","unstructured":"Omkar Thawkar, Abdelrahman Shaker, Sahal Shaji Mullappilly, Hisham Cholakkal, Rao Muhammad Anwer, Salman Khan, Jorma Laaksonen, and Fahad Shahbaz Khan. 2023. Xraygpt: Chest radiographs summarization using medical vision-language models. arXiv preprint arXiv:2306.07971 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions. Scientific data","author":"Tschandl Philipp","year":"2018","unstructured":"Philipp Tschandl, Cliff Rosendahl, and Harald Kittler. 2018. The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions. Scientific data, Vol. 5, 1 (2018), 1-9."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1001\/jamanetworkopen.2024.46615"},{"key":"e_1_3_2_1_48_1","volume-title":"MM-Retinal: Knowledge-Enhanced Foundational Pretraining with Fundus Image-Text Expertise. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 722-732","author":"Wu Ruiqi","year":"2024","unstructured":"Ruiqi Wu, Chenran Zhang, Jianle Zhang, Yi Zhou, Tao Zhou, and Huazhu Fu. 2024. MM-Retinal: Knowledge-Enhanced Foundational Pretraining with Fundus Image-Text Expertise. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 722-732."},{"key":"e_1_3_2_1_49_1","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 209-219","author":"Fu Yujuan","year":"2024","unstructured":"Wen-wai Yim, Yujuan Fu, Zhaoyi Sun, Asma Ben Abacha, Meliha Yetisgen, and Fei Xia. 2024. Dermavqa: A multilingual visual question answering dataset for dermatology. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 209-219."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Hongbo Zhang Junying Chen Feng Jiang Fei Yu Zhihong Chen Jianquan Li Guiming Chen Xiangbo Wu Zhiyi Zhang Qingying Xiao et al. 2023a. Huatuogpt towards taming language model to be a doctor. arXiv preprint arXiv:2305.15075 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.725"},{"key":"e_1_3_2_1_51_1","unstructured":"Sheng Zhang Yanbo Xu Naoto Usuyama Jaspreet Bagga Robert Tinn Sam Preston Rajesh Rao Mu Wei Naveen Valluri Cliff Wong et al. 2023c. Large-scale domain-specific pretraining for biomedical vision-language processing. arXiv preprint arXiv:2303.00915 Vol. 2 3 (2023) 6."},{"key":"e_1_3_2_1_52_1","volume-title":"Radgenome-chest ct: A grounded vision-language dataset for chest ct analysis. arXiv preprint arXiv:2404.16754","author":"Zhang Xiaoman","year":"2024","unstructured":"Xiaoman Zhang, Chaoyi Wu, Ziheng Zhao, Jiayu Lei, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2024. Radgenome-chest ct: A grounded vision-language dataset for chest ct analysis. arXiv preprint arXiv:2404.16754 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Pmc-vqa: Visual instruction tuning for medical visual question answering. arXiv preprint arXiv:2305.10415","author":"Zhang Xiaoman","year":"2023","unstructured":"Xiaoman Zhang, Chaoyi Wu, Ziheng Zhao, Weixiong Lin, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023b. Pmc-vqa: Visual instruction tuning for medical visual question answering. arXiv preprint arXiv:2305.10415 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Easygen: Easing multimodal generation with bidiffuser and llms. arXiv preprint arXiv:2310.08949","author":"Zhao Xiangyu","year":"2023","unstructured":"Xiangyu Zhao, Bo Liu, Qijiong Liu, Guangyuan Shi, and Xiao-Ming Wu. 2023. Easygen: Easing multimodal generation with bidiffuser and llms. arXiv preprint arXiv:2310.08949 (2023)."},{"key":"e_1_3_2_1_55_1","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al., 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46595-46623.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-50043-3"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755187","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:02:24Z","timestamp":1765342944000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755187"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":56,"alternative-id":["10.1145\/3746027.3755187","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755187","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}