{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:57Z","timestamp":1781539077765,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810843","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"977-985","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Decoupling Multimodal Perception and Reasoning for Image Reasoning Segmentation with Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7866-3339","authenticated-orcid":false,"given":"Yiqing","family":"Shen","sequence":"first","affiliation":[{"name":"Johns Hopkins University, Baltimore, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9574-0691","authenticated-orcid":false,"given":"Feng","family":"Chen","sequence":"additional","affiliation":[{"name":"JF Smartlnvest Holdings, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Alibaba\u00a0DAMO Academy. 2025. Qwen 2.5: The Next-Generation Open-Source Large Language Model. https:\/\/github.com\/QwenLM\/Qwen2.5"},{"key":"e_1_3_3_1_3_2","unstructured":"Jean-Baptiste Alayrac Jeff Donahue et\u00a0al. 2022. Flamingo: A Visual Language Model for Few-Shot Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.14198 (2022). https:\/\/arxiv.org\/abs\/2204.14198"},{"key":"e_1_3_3_1_4_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang Humen Zhong Yuanzhi Zhu Mingkun Yang Zhaohai Li Jianqiang Wan Pengfei Wang Wei Ding Zheren Fu Yiheng Xu Jiabo Ye Xi Zhang Tianbao Xie Zesen Cheng Hang Zhang Zhibo Yang Haiyang Xu and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2502.13923\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"e_1_3_3_1_5_2","unstructured":"Xiaoyi Bao Siyang Sun Shuailei Ma Kecheng Zheng Yuxin Guo Guosheng Zhao Yun Zheng and Xingang Wang. 2024. CoReS: Orchestrating the Dance of Reasoning and Segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.05673 (2024)."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Arne Bilberg and Ali\u00a0Ahmad Malik. 2019. Digital twin driven human\u2013robot collaborative assembly. CIRP Annals 68 1 (2019) 499\u2013502. doi:10.1016\/j.cirp.2019.04.011","DOI":"10.1016\/j.cirp.2019.04.011"},{"key":"e_1_3_3_1_7_2","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language Models are Few-Shot Learners. Advances in Neural Information Processing Systems (NeurIPS) 33 (2020) 1877\u20131901. arXiv:https:\/\/arXiv.org\/abs\/2005.14165doi:10.48550\/arXiv.2005.14165"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Yubao Chen. 2017. Integrated and Intelligent Manufacturing: Perspectives and Enablers. Engineering 3 5 (2017) 588\u2013595. doi:10.1016\/J.ENG.2017.04.009","DOI":"10.1016\/J.ENG.2017.04.009"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Yi-Chia Chen Wei-Hua Li Cheng Sun Yu-Chiang\u00a0Frank Wang and Chu-Song Chen. 2024. SAM4MLLM: Enhance Multi-Modal Large Language Model for Referring Expression Segmentation. arxiv:https:\/\/arXiv.org\/abs\/2409.10542\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2409.10542","DOI":"10.1007\/978-3-031-73004-7_19"},{"key":"e_1_3_3_1_10_2","unstructured":"Damai Dai Li Dong Furu Wei Bin Xu Zhifang Liu and Zhifang Sui. 2023. In-Context Learning: A Comprehensive Survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.00234 (2023). doi:10.48550\/arXiv.2301.00234"},{"key":"e_1_3_3_1_11_2","unstructured":"DeepSeek-AI. 2025. DeepSeek-V3 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2412.19437\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2412.19437"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Hao Ding Lalithkumar Seenivasan Benjamin\u00a0D. Killeen Sue\u00a0Min Cho and Mathias Unberath. 2024. Digital twins as a unifying framework for surgical data science: the enabling role of geometric scene understanding. Artificial Intelligence Surgery 4 3 (2024). doi:10.20517\/ais.2024.16","DOI":"10.20517\/ais.2024.16"},{"key":"e_1_3_3_1_13_2","unstructured":"Vlad Fomenko Han Yu Jongho Lee Stanley Hsieh and Weizhu Chen. 2024. A Note on LoRA. arxiv:https:\/\/arXiv.org\/abs\/2404.05086\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2404.05086"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Edward Glaessgen and David Stargel. 2012. The digital twin paradigm for future NASA and U.S. air force vehicles. doi:10.2514\/6.2012-1818","DOI":"10.2514\/6.2012-1818"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Edward Glaessgen and David Stargel. 2012. The digital twin paradigm for future NASA and U.S. air force vehicles. doi:10.2514\/6.2012-1818","DOI":"10.2514\/6.2012-1818"},{"key":"e_1_3_3_1_16_2","unstructured":"Edward\u00a0J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.09685 (2021). doi:10.48550\/arXiv.2106.09685"},{"key":"e_1_3_3_1_17_2","unstructured":"Baidu Inc.2025. ERNIE Bot (Wenxin Yiyan). https:\/\/yiyan.baidu.com\/"},{"key":"e_1_3_3_1_18_2","unstructured":"Laurynas Karazija Iro Laina and Christian Rupprecht. 2021. ClevrTex: A Texture-Rich Benchmark for Unsupervised Multi-Object Segmentation. arxiv:https:\/\/arXiv.org\/abs\/2111.10265\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2111.10265"},{"key":"e_1_3_3_1_19_2","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander\u00a0C. Berg Wan-Yen Lo Piotr Doll\u00e1r and Ross Girshick. 2023. Segment Anything. arxiv:https:\/\/arXiv.org\/abs\/2304.02643\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2304.02643"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"e_1_3_3_1_21_2","first-page":"1288","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven C.\u00a0H. Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1288\u20131300. doi:10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_3_1_22_2","unstructured":"Chang Liu Henghui Ding and Xudong Jiang. 2023. GRES: Generalized Referring Expression Segmentation. arxiv:https:\/\/arXiv.org\/abs\/2306.00968\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2306.00968"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Zheng Liu Norbert Meyendorf and Nezih Mrad. 2018. The role of data fusion in predictive maintenance using digital twin. AIP Conference Proceedings 1949 1 (04 2018) 020023. arXiv:https:\/\/pubs.aip.org\/aip\/acp\/article-pdf\/doi\/10.1063\/1.5031520\/13848976\/020023_1_online.pdfdoi:10.1063\/1.5031520","DOI":"10.1063\/1.5031520"},{"key":"e_1_3_3_1_24_2","unstructured":"Junhua Mao Jonathan Huang Alexander Toshev Oana Camburu Alan Yuille and Kevin Murphy. 2016. Generation and Comprehension of Unambiguous Object Descriptions. arxiv:https:\/\/arXiv.org\/abs\/1511.02283\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1511.02283"},{"key":"e_1_3_3_1_25_2","unstructured":"Matthias Minderer Alexey Gritsenko and Neil Houlsby. 2024. Scaling Open-Vocabulary Object Detection. arxiv:https:\/\/arXiv.org\/abs\/2306.09683\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2306.09683"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/SSCI.2017.8285439"},{"key":"e_1_3_3_1_27_2","unstructured":"OpenAI. 2024. GPT-4o-mini: Lightweight Multimodal Language Model. https:\/\/platform.openai.com\/docs\/models\/gpt-4o-mini Version: gpt-4o-mini-2024-07-18."},{"key":"e_1_3_3_1_28_2","unstructured":"OpenAI. 2024. GPT-4o: Multimodal Language Model with Omni Capabilities. https:\/\/platform.openai.com\/docs\/models\/gpt-4o Version: gpt-4o-2024-05-13."},{"key":"e_1_3_3_1_29_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark et\u00a0al. 2021. CLIP: Connecting Text and Images. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2103.00020 (2021). https:\/\/arxiv.org\/abs\/2103.00020 Foundational work on vision-language pretraining."},{"key":"e_1_3_3_1_30_2","unstructured":"Meta\u00a0AI Research. 2024. Llama 3.3-70B-Instruct: A Cost-Efficient Open Foundation Model. https:\/\/huggingface.co\/meta-llama\/Llama-3.3-70B-Instruct"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_3_1_32_2","unstructured":"Hongchao Shu Ruixing Liang Zhaoshuo Li Anna Goodridge Xiangyu Zhang Hao Ding Nimesh Nagururu Manish Sahu Francis\u00a0X. Creighton Russell\u00a0H. Taylor Adnan Munawar and Mathias Unberath. 2023. Twin-S: A Digital Twin for Skull-base Surgery. arxiv:https:\/\/arXiv.org\/abs\/2211.11863\u00a0[cs.HC] https:\/\/arxiv.org\/abs\/2211.11863"},{"key":"e_1_3_3_1_33_2","unstructured":"Hugo Touvron et\u00a0al. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023). https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00183"},{"key":"e_1_3_3_1_35_2","unstructured":"Wenhai Wang Zhe Chen Xiaokang Chen Jiannan Wu Xizhou Zhu Gang Zeng Ping Luo Tong Lu Jie Zhou Yu Qiao and Jifeng Dai. 2023. VisionLLM: Large Language Model is also an Open-Ended Decoder for Vision-Centric Tasks. arxiv:https:\/\/arXiv.org\/abs\/2305.11175\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2305.11175"},{"key":"e_1_3_3_1_36_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Wang XuDong","year":"2025","unstructured":"XuDong Wang, Shaolun Zhang, Shufan Li, Kehan Li, Konstantinos Kallidromitis, Yusuke Kato, Kazuki Kozuka, and Trevor Darrell. 2025. SegLLM: Multi-round Reasoning Segmentation with Large Language Models. In The Thirteenth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Pm1NXHgzyf"},{"key":"e_1_3_3_1_37_2","unstructured":"Cong Wei Haoxian Tan Yujie Zhong Yujiu Yang and Lin Ma. 2024. LaSagnA: Language-based Segmentation Assistant for Complex Queries. arxiv:https:\/\/arXiv.org\/abs\/2404.08506\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2404.08506"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Ed Chi Quoc Le and Denny Zhou. 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. Advances in Neural Information Processing Systems (NeurIPS) 35 (2022) 24824\u201324837.","DOI":"10.52202\/068431-1800"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00370"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Daoguang Yang Hamid Karimi Okyay Kaynak and Shen Yin. 2021. Developments of digital twin technologies in industrial smart city and healthcare sectors: a survey. Complex Engineering Systems 1 (09 2021). doi:10.20517\/ces.2021.06","DOI":"10.20517\/ces.2021.06"},{"key":"e_1_3_3_1_41_2","unstructured":"Senqiao Yang Tianyuan Qu Xin Lai et\u00a0al. 2024. LISA++: An Improved Baseline for Reasoning Segmentation with Large Language Model. arxiv:https:\/\/arXiv.org\/abs\/2312.17240\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2312.17240"},{"key":"e_1_3_3_1_42_2","unstructured":"Yuqi Yang Peng-Tao Jiang Jing Wang Hao Zhang Kai Zhao Jinwei Chen and Bo Li. 2024. Empowering Segmentation Ability to Multi-modal Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2403.14141\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2403.14141"},{"key":"e_1_3_3_1_43_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Zhaohui Zheng Ping Wang Wei Liu Jinze Li Rongguang Ye and Dongwei Ren. 2020. Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression. Proceedings of the AAAI Conference on Artificial Intelligence 34 07 (2020) 12993\u201313000. doi:10.1609\/aaai.v34i07.6999","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"e_1_3_3_1_46_2","unstructured":"Denny Zhou Hong Liu Jaehoon Lee and Aviral Kumar. 2024. Chain-of-Thought Reasoning without Prompting: Mining the Intrinsic Reasoning Abilities of Language Models. arXiv preprint (2024). doi:10.48550\/arXiv.2408.03314"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:50:51Z","timestamp":1781538651000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810843"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":45,"alternative-id":["10.1145\/3805622.3810843","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810843","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}