{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T13:50:39Z","timestamp":1765547439424,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733374","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"398-406","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["LLAUS: A High-Quality Instruction-Tuned Large Vision Language Assistant for UltraSound"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7907-0260","authenticated-orcid":false,"given":"Junhao","family":"Guo","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6011-0769","authenticated-orcid":false,"given":"XueFeng","family":"Shan","sequence":"additional","affiliation":[{"name":"Xinjiang Medical University, Xinjiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3131-6916","authenticated-orcid":false,"given":"Guoming","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4859-1757","authenticated-orcid":false,"given":"Dong","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5720-0941","authenticated-orcid":false,"given":"Rongxing","family":"Lu","sequence":"additional","affiliation":[{"name":"University of New Brunswick, Fredericton, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7356-9711","authenticated-orcid":false,"given":"Siliang","family":"Tang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"MISS: A Generative Pretraining and Finetuning Approach for Med-VQA. arXiv preprint arXiv:2401.05163","author":"Chen Jiawei","year":"2024","unstructured":"Jiawei Chen, Dingkang Yang, Yue Jiang, Yuxuan Lei, and Lihua Zhang. 2024b. MISS: A Generative Pretraining and Finetuning Approach for Med-VQA. arXiv preprint arXiv:2401.05163 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Jeya Maria Jose Valanarasu, Alaa Youssef, Joseph Paul Cohen, Eduardo Pontes Reis, et al.","author":"Chen Zhihong","year":"2024","unstructured":"Zhihong Chen, Maya Varma, Jean-Benoit Delbrouck, Magdalini Paschali, Louis Blankemeier, Dave Van Veen, Jeya Maria Jose Valanarasu, Alaa Youssef, Joseph Paul Cohen, Eduardo Pontes Reis, et al. 2024a. Chexagent: Towards a foundation model for chest x-ray interpretation. arXiv preprint arXiv:2401.12208 (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Junlong Cheng Jin Ye Zhongying Deng Jianpin Chen Tianbin Li Haoyu Wang Yanzhou Su Ziyan Huang Jilong Chen Lei Jiang et al. 2023. Sam-med2d. arXiv preprint arXiv:2308.16184 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Gerard De Melo, and","author":"Eslami Sedigheh","year":"2023","unstructured":"Sedigheh Eslami, Christoph Meinel, Gerard De Melo, and. 2023. Pubmedclip: How much does clip benefit visual question answering in the medical domain?. In Findings of the Association for Computational Linguistics: EACL 2023. 1181--1193."},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Fan Lijie","year":"2024","unstructured":"Lijie Fan, Dilip Krishnan, Phillip Isola, Dina Katabi, and Yonglong Tian. 2024. Improving clip training with language rewrites. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1148\/rg.234035034"},{"key":"e_1_3_2_1_8_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"BioMistral: A Collection of Open-Source Pretrained Large Language Models for Medical Domains. arXiv preprint arXiv:2402.10373","author":"Labrak Yanis","year":"2024","unstructured":"Yanis Labrak, Adrien Bazoge, Emmanuel Morin, Pierre-Antoine Gourraud, Mickael Rouvier, and Richard Dufour. 2024. BioMistral: A Collection of Open-Source Pretrained Large Language Models for Medical Domains. arXiv preprint arXiv:2402.10373 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024a. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024b. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_13_1","volume-title":"Pmc-clip: Contrastive language-image pre-training using biomedical documents. In MICCAI.","author":"Lin Weixiong","year":"2023","unstructured":"Weixiong Lin, Ziheng Zhao, Xiaoman Zhang, Chaoyi Wu, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023. Pmc-clip: Contrastive language-image pre-training using biomedical documents. In MICCAI."},{"key":"e_1_3_2_1_14_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_2_1_15_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_16_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv: 2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_17_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arxiv: 2312.11805","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, and Jiahui Yu et al. 2024. Gemini: A Family of Highly Capable Multimodal Models. arxiv: 2312.11805"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_19_1","volume-title":"Huatuo: Tuning llama model with chinese medical knowledge. arXiv preprint arXiv:2304.06975","author":"Wang Haochun","year":"2023","unstructured":"Haochun Wang, Chi Liu, Nuwa Xi, Zewen Qiang, Sendong Zhao, Bing Qin, and Ting Liu. 2023. Huatuo: Tuning llama model with chinese medical knowledge. arXiv preprint arXiv:2304.06975 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Pmc-llama: Further finetuning llama on medical papers. arXiv preprint arXiv:2304.14454","author":"Wu Chaoyi","year":"2023","unstructured":"Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023. Pmc-llama: Further finetuning llama on medical papers. arXiv preprint arXiv:2304.14454 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Doctorglm: Fine-tuning your chinese doctor is not a herculean task. arXiv preprint arXiv:2304.01097","author":"Xiong Honglin","year":"2023","unstructured":"Honglin Xiong, Sheng Wang, Yitao Zhu, Zihao Zhao, Yuxiao Liu, Linlin Huang, Qian Wang, and Dinggang Shen. 2023. Doctorglm: Fine-tuning your chinese doctor is not a herculean task. arXiv preprint arXiv:2304.01097 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Chatdoctor: A medical chat model fine-tuned on llama model using medical domain knowledge. arXiv preprint arXiv:2303.14070","author":"Yunxiang Li","year":"2023","unstructured":"Li Yunxiang, Li Zihan, Zhang Kai, Dan Ruilong, and Zhang You. 2023. Chatdoctor: A medical chat model fine-tuned on llama model using medical domain knowledge. arXiv preprint arXiv:2303.14070 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Biomedgpt: A unified and generalist biomedical generative pre-trained transformer for vision, language, and multimodal tasks. arXiv preprint arXiv:2305.17100","author":"Zhang Kai","year":"2023","unstructured":"Kai Zhang, Jun Yu, Zhiling Yan, Yixin Liu, Eashan Adhikarla, Sunyang Fu, Xun Chen, Chen Chen, Yuyin Zhou, Xiang Li, et al. 2023b. Biomedgpt: A unified and generalist biomedical generative pre-trained transformer for vision, language, and multimodal tasks. arXiv preprint arXiv:2305.17100 (2023)."},{"key":"e_1_3_2_1_24_1","unstructured":"Sheng Zhang Yanbo Xu Naoto Usuyama Hanwen Xu Jaspreet Bagga Robert Tinn Sam Preston Rajesh Rao Mu Wei Naveen Valluri et al. 2023a. BiomedCLIP: a multimodal biomedical foundation model pretrained from fifteen million scientific image-text pairs. arXiv preprint arXiv:2303.00915 (2023)."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733374","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:04:30Z","timestamp":1755749070000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733374"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":24,"alternative-id":["10.1145\/3731715.3733374","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733374","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}