{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:47:24Z","timestamp":1755802044160,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"name":"National Education Science Planning Project","award":["Grant No. BIX230343"],"award-info":[{"award-number":["Grant No. BIX230343"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No. 62366036"],"award-info":[{"award-number":["Grant No. 62366036"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Program for Young Talents of Science and Technology in Universities of Inner Mongolia Autonomous Region","award":["Grant No. NJYT24033"],"award-info":[{"award-number":["Grant No. NJYT24033"]}]},{"name":"Science and Technology Program of the Joint Fund of Scientific Research for the Public Hospitals of Inner Mongolia Academy of Medical Sciences","award":["Grant No.2023GLLH0035"],"award-info":[{"award-number":["Grant No.2023GLLH0035"]}]},{"name":"Inner Mongolia Autonomous Region Science and Technology Planning Project","award":["Grant No. 2023YFSH0017"],"award-info":[{"award-number":["Grant No. 2023YFSH0017"]}]},{"name":"Key R&D and Achievement Transformation Program of Inner Mongolia Autonomous Region","award":["Grant No. 2022YFHH0077"],"award-info":[{"award-number":["Grant No. 2022YFHH0077"]}]},{"name":"Hohhot Science and Technology Project","award":["Grant No. 2023-Zhan-Zhong-1"],"award-info":[{"award-number":["Grant No. 2023-Zhan-Zhong-1"]}]},{"name":"The Reform and Development of Local Universities (Disciplinary Construction) and the Special Research Project of First-Class Discipline of Inner Mongolia","award":["Grant No. YLXKZX-ND-036"],"award-info":[{"award-number":["Grant No. YLXKZX-ND-036"]}]},{"name":"The Central Government Fund for Promoting Local Scientific and Technological Development","award":["Grant No. 2022ZY0198"],"award-info":[{"award-number":["Grant No. 2022ZY0198"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733384","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"880-888","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MedVSA: Medical Visual Spoken-Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0962-724X","authenticated-orcid":false,"given":"Lei","family":"Liu","sequence":"first","affiliation":[{"name":"Inner Mongolia University, Huhhot, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8061-1474","authenticated-orcid":false,"given":"Xiangdong","family":"Su","sequence":"additional","affiliation":[{"name":"Inner Mongolia University, Huhhot, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5513-1192","authenticated-orcid":false,"given":"Guanglai","family":"Gao","sequence":"additional","affiliation":[{"name":"Inner Mongolia University, Huhhot, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"VQA-Med: Overview of the medical visual question answering task at ImageCLEF","author":"Abacha Asma Ben","year":"2019","unstructured":"Asma Ben Abacha, Sadid A Hasan, Vivek V Datla, Joey Liu, Dina Demner-Fushman, and Henning M\u00fcller. 2019. VQA-Med: Overview of the medical visual question answering task at ImageCLEF 2019. CLEF (working notes), Vol. 2, 6 (2019)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7179007"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-16443-9_65"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548122"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87240-3_7"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2022.3185008"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458754"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_10_1","volume-title":"Pathvqa: 30000 questions for medical visual question answering. arXiv preprint arXiv:2003.10286","author":"He Xuehai","year":"2020","unstructured":"Xuehai He, Yichen Zhang, Luntian Mou, Eric Xing, and Pengtao Xie. 2020. Pathvqa: 30000 questions for medical visual question answering. arXiv preprint arXiv:2003.10286 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00110"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531724"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01052"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434063"},{"key":"e_1_3_2_1_15_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Kim Jin-Hwa","year":"2018","unstructured":"Jin-Hwa Kim, Jaehyun Jun, and Byoung-Tak Zhang. 2018. Bilinear attention networks. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"Asma Ben Abacha, and Dina Demner-Fushman","author":"Lau Jason J","year":"2018","unstructured":"Jason J Lau, Soumya Gayen, Asma Ben Abacha, and Dina Demner-Fushman. 2018. A dataset of clinically generated visual questions and answers about radiology images. Scientific data, Vol. 5, 1 (2018), 1--10."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43907-0_36"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43993-3_51"},{"volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","author":"Liu Bo","key":"e_1_3_2_1_19_1","unstructured":"Bo Liu, Li-Ming Zhan, and Xiao-Ming Wu. 2021a. Contrastive Pre-training and Representation Distillation for Medical Visual Question Answering Based on Radiology Images. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 210--220."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM55620.2022.9995347"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM62325.2024.10821918"},{"key":"e_1_3_2_1_23_1","volume-title":"Optimizing Transformer and MLP with Hidden States Perturbation for Medical Visual Question Answering. In 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM). IEEE, 5515--5522","author":"Liu Lei","year":"2024","unstructured":"Lei Liu, Xiangdong Su, and Guanglai Gao. 2024b. Optimizing Transformer and MLP with Hidden States Perturbation for Medical Visual Question Answering. In 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM). IEEE, 5515--5522."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956469"},{"key":"e_1_3_2_1_25_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-32251-9_57"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_29_1","volume-title":"An introduction to hidden Markov models. ieee assp magazine","author":"Rabiner Lawrence","year":"1986","unstructured":"Lawrence Rabiner and Biinghwang Juang. 1986. An introduction to hidden Markov models. ieee assp magazine, Vol. 3, 1 (1986), 4--16."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR, 28492--28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492--28518."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/89.365379"},{"key":"e_1_3_2_1_32_1","volume-title":"wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862","author":"Schneider Steffen","year":"2019","unstructured":"Steffen Schneider, Alexei Baevski, Ronan Collobert, and Michael Auli. 2019. wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862 (2019)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_34_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.202"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413761"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00381"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461404"},{"key":"e_1_3_2_1_40_1","volume-title":"Pmc-vqa: Visual instruction tuning for medical visual question answering. arXiv preprint arXiv:2305.10415","author":"Zhang Xiaoman","year":"2023","unstructured":"Xiaoman Zhang, Chaoyi Wu, Ziheng Zhao, Weixiong Lin, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023. Pmc-vqa: Visual instruction tuning for medical visual question answering. arXiv preprint arXiv:2305.10415 (2023)."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733384","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:08:01Z","timestamp":1755749281000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733384"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":40,"alternative-id":["10.1145\/3731715.3733384","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733384","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}