{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T09:08:08Z","timestamp":1769850488255,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754736","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"2742-2751","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["MediSee: Reasoning-Based Pixel-Level Perception in Medical Images"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1007-9349","authenticated-orcid":false,"given":"Qinyue","family":"Tong","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3579-9130","authenticated-orcid":false,"given":"Ziqian","family":"Lu","sequence":"additional","affiliation":[{"name":"Zhejiang Sci-Tech University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4759-2276","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7577-4778","authenticated-orcid":false,"given":"Yangming","family":"Zheng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1785-7847","authenticated-orcid":false,"given":"Zhe-Ming","family":"Lu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Michela Antonelli Annika Reinke Spyridon Bakas Keyvan Farahani Annette Kopp-Schneider Bennett A Landman Geert Litjens Bjoern Menze Olaf Ronneberger Ronald M Summers et al. 2022. The medical segmentation decathlon. Nature communications Vol. 13 1 (2022) 4128.","DOI":"10.1038\/s41467-022-30695-9"},{"key":"e_1_3_2_1_3_1","unstructured":"Ujjwal Baid Satyam Ghodasara Suyash Mohan Michel Bilello Evan Calabrese Errol Colak Keyvan Farahani Jayashree Kalpathy-Cramer Felipe C Kitamura Sarthak Pati et al. 2021. The rsna-asnr-miccai brats 2021 benchmark on brain tumor segmentation and radiogenomic classification. arXiv preprint arXiv:2107.02314 (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling. arXiv preprint arXiv:2501.17811","author":"Chen Xiaokang","year":"2025","unstructured":"Xiaokang Chen, Zhiyu Wu, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, and Chong Ruan. 2025. Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling. arXiv preprint arXiv:2501.17811 (2025)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Junlong Cheng Bin Fu Jin Ye Guoan Wang Tianbin Li Haoyu Wang Ruoyu Li He Yao Junren Chen JingWen Li et al. 2024. Interactive medical image segmentation: A benchmark dataset and baseline. arXiv preprint arXiv:2411.12814 (2024).","DOI":"10.1109\/CVPR52734.2025.01941"},{"key":"e_1_3_2_1_6_1","unstructured":"Junlong Cheng Jin Ye Zhongying Deng Jianpin Chen Tianbin Li Haoyu Wang Yanzhou Su Ziyan Huang Jilong Chen Lei Jiang et al. 2023. Sam-med2d. arXiv preprint arXiv:2308.16184 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46976-8_19"},{"key":"e_1_3_2_1_8_1","first-page":"110746","article-title":"Segvol: Universal and interactive volumetric medical image segmentation","volume":"37","author":"Du Yuxin","year":"2024","unstructured":"Yuxin Du, Fan Bai, Tiejun Huang, and Bo Zhao. 2024. Segvol: Universal and interactive volumetric medical image segmentation. Advances in Neural Information Processing Systems, Vol. 37 (2024), 110746-110783.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","unstructured":"Nicholas Heller Fabian Isensee Dasha Trofimova Resha Tejpaul Zhongchen Zhao Huai Chen Lisheng Wang Alex Golts Daniel Khapun Daniel Shats et al. 2023. The kits21 challenge: Automatic segmentation of kidneys renal tumors and renal cysts in corticomedullary-phase ct. arXiv preprint arXiv:2307.01984 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.3390\/data5010014"},{"key":"e_1_3_2_1_11_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. ICLR, Vol. 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_12_1","volume-title":"Jens Petersen, and Klaus H Maier-Hein.","author":"Isensee Fabian","year":"2021","unstructured":"Fabian Isensee, Paul F Jaeger, Simon AA Kohl, Jens Petersen, and Klaus H Maier-Hein. 2021. nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation. Nature methods, Vol. 18, 2 (2021), 203-211."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1038\/s43018-023-00619-7"},{"key":"e_1_3_2_1_14_1","volume-title":"MMR: A Large-scale Benchmark Dataset for Multi-target and Multi-granularity Reasoning Segmentation. arXiv preprint arXiv:2503.13881","author":"Jang Donggon","year":"2025","unstructured":"Donggon Jang, Yucheol Cho, Suin Lee, Taehyeon Kim, and Dae-Shik Kim. 2025. MMR: A Large-scale Benchmark Dataset for Multi-target and Multi-granularity Reasoning Segmentation. arXiv preprint arXiv:2503.13881 (2025)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"e_1_3_2_1_17_1","first-page":"28541","article-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day","volume":"36","author":"Li Chunyuan","year":"2023","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2023. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2023), 28541-28564.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_19_1","volume-title":"European Conference on Computer Vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al., 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38-55."},{"key":"e_1_3_2_1_20_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-44824-z"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/S2589-7500(24)00154-7"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-024-03010-w"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1080\/02564602.2014.906861"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02561"},{"key":"e_1_3_2_1_27_1","volume-title":"Reasoning to Attend: Try to Understand Ho SEG Token Works. arXiv preprint arXiv:2412.17741","author":"Qian Rui","year":"2024","unstructured":"Rui Qian, Xin Yin, and Dejing Dou. 2024. Reasoning to Attend: Try to Understand Ho SEG Token Works. arXiv preprint arXiv:2412.17741 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Ozan Oktay, Konstantinos Kamnitsas, Jonathan Passerat-Palmbach, Wenjia Bai, Mellisa Damodaram, Mary A Rutherford, Joseph V Hajnal","author":"Rajchl Martin","year":"2016","unstructured":"Martin Rajchl, Matthew CH Lee, Ozan Oktay, Konstantinos Kamnitsas, Jonathan Passerat-Palmbach, Wenjia Bai, Mellisa Damodaram, Mary A Rutherford, Joseph V Hajnal, Bernhard Kainz, et al., 2016. Deepcut: Object segmentation from bounding box annotations using convolutional neural networks. IEEE transactions on medical imaging, Vol. 36, 2 (2016), 674-683."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.4108\/eai.12-4-2021.169184"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_32_1","first-page":"234","volume-title":"Munich","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical image computing and computer-assisted intervention-MICCAI 2015: 18th international conference, Munich, Germany, October 5-9, 2015, proceedings, part III 18. Springer, 234-241."},{"key":"e_1_3_2_1_33_1","volume-title":"Catalin Fetita, Pierre-Yves Brillet, Christophe Lefevre, Wenzhe Xue, Xiangjun Zhu, Jianming Liang, Ilkay \u00d6ks\u00fcz, et al.","author":"Rudyanto Rina D","year":"2014","unstructured":"Rina D Rudyanto, Sjoerd Kerkstra, Eva M Van Rikxoort, Catalin Fetita, Pierre-Yves Brillet, Christophe Lefevre, Wenzhe Xue, Xiangjun Zhu, Jianming Liang, Ilkay \u00d6ks\u00fcz, et al., 2014. Comparing algorithms for automated vessel segmentation in computed tomography scans of the lung: the VESSEL12 study. Medical image analysis, Vol. 18, 7 (2014), 1217-1232."},{"key":"e_1_3_2_1_34_1","volume-title":"End-to-end prostate cancer detection in bpMRI via 3D CNNs: effects of attention mechanisms, clinical priori and decoupled false positive reduction. Medical image analysis","author":"Saha Anindo","year":"2021","unstructured":"Anindo Saha, Matin Hosseinzadeh, and Henkjan Huisman. 2021. End-to-end prostate cancer detection in bpMRI via 3D CNNs: effects of attention mechanisms, clinical priori and decoupled false positive reduction. Medical image analysis, Vol. 73 (2021), 102155."},{"key":"e_1_3_2_1_35_1","volume-title":"Interactive segmentation of medical images through fully convolutional neural networks. arXiv","author":"Sakinis T","year":"2019","unstructured":"T Sakinis, F Milletari, H Roth, P Korfiatis, P Kostandy, K Philbrick, Z Akkus, Z Xu, D Xu, and BJ Erickson. 1903. Interactive segmentation of medical images through fully convolutional neural networks. arXiv 2019. arXiv preprint arXiv:1903.08205 (1903)."},{"key":"e_1_3_2_1_36_1","volume-title":"Moira SN Berens, Cas Van Den Bogaard, Piergiorgio Cerello, Hao Chen, Qi Dou, Maria Evelina Fantacci, Bram Geurts, et al.","author":"Adiyoso Setio Arnaud Arindra","year":"2017","unstructured":"Arnaud Arindra Adiyoso Setio, Alberto Traverso, Thomas De Bel, Moira SN Berens, Cas Van Den Bogaard, Piergiorgio Cerello, Hao Chen, Qi Dou, Maria Evelina Fantacci, Bram Geurts, et al., 2017. Validation, comparison, and combination of algorithms for automatic detection of pulmonary nodules in computed tomography images: the LUNA16 challenge. Medical image analysis, Vol. 42 (2017), 1-13."},{"key":"e_1_3_2_1_37_1","volume-title":"Slic-Seg: A minimally interactive segmentation of the placenta from sparse and motion-corrupted fetal MRI in multiple views. Medical image analysis","author":"Wang Guotai","year":"2016","unstructured":"Guotai Wang, Maria A Zuluaga, Rosalind Pratt, Michael Aertsen, Tom Doel, Maria Klusmann, Anna L David, Jan Deprest, Tom Vercauteren, and S\u00e9bastien Ourselin. 2016. Slic-Seg: A minimally interactive segmentation of the placenta from sparse and motion-corrupted fetal MRI in multiple views. Medical image analysis, Vol. 34 (2016), 137-147."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00183"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1049\/ipr2.12419"},{"key":"e_1_3_2_1_40_1","volume-title":"SegLLM: Multi-round Reasoning Segmentation. arXiv preprint arXiv:2410.18923","author":"Wang XuDong","year":"2024","unstructured":"XuDong Wang, Shaolun Zhang, Shufan Li, Konstantinos Kallidromitis, Kehan Li, Yusuke Kato, Kazuki Kozuka, and Trevor Darrell. 2024. SegLLM: Multi-round Reasoning Segmentation. arXiv preprint arXiv:2410.18923 (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2944958"},{"key":"e_1_3_2_1_42_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_43_1","volume-title":"European Conference on Computer Vision. Springer, 207-229","author":"Wong Hallee E","year":"2024","unstructured":"Hallee E Wong, Marianne Rakic, John Guttag, and Adrian V Dalca. 2024. Scribbleprompt: fast and flexible interactive segmentation for any biomedical image. In European Conference on Computer Vision. Springer, 207-229."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1117\/1.JMI.5.3.036501"},{"key":"e_1_3_2_1_45_1","volume-title":"LISA: An Improved Baseline for Reasoning Segmentation with Large Language Model. arXiv preprint arXiv:2312.17240","author":"Yang Senqiao","year":"2023","unstructured":"Senqiao Yang, Tianyuan Qu, Xin Lai, Zhuotao Tian, Bohao Peng, Shu Liu, and Jiaya Jia. 2023. LISA: An Improved Baseline for Reasoning Segmentation with Large Language Model. arXiv preprint arXiv:2312.17240 (2023)."},{"key":"e_1_3_2_1_46_1","unstructured":"Jin Ye Junlong Cheng Jianpin Chen Zhongying Deng Tianbin Li Haoyu Wang Yanzhou Su Ziyan Huang Jilong Chen Lei Jiang et al. 2023. Sa-med2d-20m dataset: Segment anything in 2d medical imaging with 20 million masks. arXiv preprint arXiv:2311.11969 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Next-chat: An lmm for chat, detection and segmentation. arXiv preprint arXiv:2311.04498","author":"Zhang Ao","year":"2023","unstructured":"Ao Zhang, Yuan Yao, Wei Ji, Zhiyuan Liu, and Tat-Seng Chua. 2023. Next-chat: An lmm for chat, detection and segmentation. arXiv preprint arXiv:2311.04498 (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artmed.2020.101998"},{"key":"e_1_3_2_1_49_1","volume-title":"Sid Kiblawi, Tristan Naumann, Jianfeng Gao, Angela Crabtree, Jacob Abel, et al.","author":"Zhao Theodore","year":"2024","unstructured":"Theodore Zhao, Yu Gu, Jianwei Yang, Naoto Usuyama, Ho Hin Lee, Sid Kiblawi, Tristan Naumann, Jianfeng Gao, Angela Crabtree, Jacob Abel, et al., 2024. A foundation model for joint segmentation, detection and recognition of biomedical objects across nine modalities. Nature methods (2024), 1-11."},{"key":"e_1_3_2_1_50_1","volume-title":"One model to rule them all: Towards universal segmentation for medical images with text prompts. arXiv preprint arXiv:2312.17183","author":"Zhao Ziheng","year":"2023","unstructured":"Ziheng Zhao, Yao Zhang, Chaoyi Wu, Xiaoman Zhang, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2023. One model to rule them all: Towards universal segmentation for medical images with text prompts. arXiv preprint arXiv:2312.17183 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2022.102599"},{"key":"e_1_3_2_1_52_1","volume-title":"DLMIA 2018, and 8th international workshop, ML-CDS 2018, held in conjunction with MICCAI 2018, Granada, Spain, September 20, 2018, proceedings 4. Springer, 3-11","author":"Zhou Zongwei","year":"2018","unstructured":"Zongwei Zhou, Md Mahfuzur Rahman Siddiquee, Nima Tajbakhsh, and Jianming Liang. 2018. Unet: A nested u-net architecture for medical image segmentation. In Deep learning in medical image analysis and multimodal learning for clinical decision support: 4th international workshop, DLMIA 2018, and 8th international workshop, ML-CDS 2018, held in conjunction with MICCAI 2018, Granada, Spain, September 20, 2018, proceedings 4. Springer, 3-11."},{"key":"e_1_3_2_1_53_1","volume-title":"Uni-med: a unified medical generalist foundation model for multi-task learning via connector-MoE. arXiv preprint arXiv:2409.17508","author":"Zhu Xun","year":"2024","unstructured":"Xun Zhu, Ying Hu, Fanbin Mo, Miao Li, and Ji Wu. 2024. Uni-med: a unified medical generalist foundation model for multi-task learning via connector-MoE. arXiv preprint arXiv:2409.17508 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754736","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:23Z","timestamp":1765339523000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754736"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":53,"alternative-id":["10.1145\/3746027.3754736","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754736","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}