{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:55:58Z","timestamp":1781538958058,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Dalian Key Field Innovation Team Support Plan","award":["2020RT07"],"award-info":[{"award-number":["2020RT07"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810836","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1568-1572","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Locate Core, Refine Path: A Training-Free Closed-Loop Paradigm for Referring Video Object Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-8365-2317","authenticated-orcid":false,"given":"Jizhe","family":"Yu","sequence":"first","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0232-5084","authenticated-orcid":false,"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8324-2422","authenticated-orcid":false,"given":"Xiya","family":"Bu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7667-3956","authenticated-orcid":false,"given":"Yuhang","family":"Duan","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6350-3608","authenticated-orcid":false,"given":"Xiaoshuai","family":"Wu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8013-4372","authenticated-orcid":false,"given":"Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"e_1_3_3_2_3_2","unstructured":"Suhwan Cho Seunghoon Lee Minhyeok Lee Jungho Lee and Sangyoun Lee. 2025. Find First Track Next: Decoupling Identification and Propagation in Referring Video Object Segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.03492 (2025)."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888377"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01290"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446192"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02717"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01234"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01266"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32387"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20870-7_8"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01861"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Zhuoyan Luo Yicheng Xiao Yong Liu Shuyan Li Yitong Wang Yansong Tang Xiu Li and Yujiu Yang. 2023. Soc: Semantic-assisted object cluster for referring video object segmentation. Advances in Neural Information Processing Systems 36 (2023) 26425\u201326437.","DOI":"10.52202\/075280-1149"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01776"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_3_2_18_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_2_19_2","unstructured":"Nikhila Ravi Valentin Gabeur Yuan-Ting Hu Ronghang Hu Chaitanya\u00a0K. Ryali Tengyu Ma Haitham Khedr Roman R\u00e4dle Chlo\u00e9 Rolland Laura Gustafson Eric Mintun Junting Pan Kalyan\u00a0Vasudev Alwala Nicolas Carion Chao-Yuan Wu Ross\u00a0B. Girshick Piotr Doll\u2019ar and Christoph Feichtenhofer. 2024. SAM 2: Segment Anything in Images and Videos. ArXiv abs\/2408.00714 (2024). https:\/\/api.semanticscholar.org\/CorpusID:271601113"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Fu Rong Meng Lan Qian Zhang and Lefei Zhang. 2025. MPG-SAM 2: Adapting SAM 2 with Mask Priors and Global Context for Referring Video Object Segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.13667 (2025).","DOI":"10.1109\/ICCV51701.2025.02223"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_13"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Zeyi Sun Ye Fang Tong Wu Pan Zhang Yuhang Zang Shu Kong Yuanjun Xiong Dahua Lin and Jiaqi Wang. 2023. Alpha-CLIP: A CLIP Model Focusing on Wherever you Want. 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023) 13019\u201313029. https:\/\/api.semanticscholar.org\/CorpusID:266055413","DOI":"10.1109\/CVPR52733.2024.01237"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447901"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00259"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00492"},{"key":"e_1_3_3_2_26_2","unstructured":"Ning Xu Linjie Yang Yuchen Fan Dingcheng Yue Yuchen Liang Jianchao Yang and Thomas Huang. 2018. Youtube-vos: A large-scale video object segmentation benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1809.03327 (2018)."},{"key":"e_1_3_3_2_27_2","first-page":"98","volume-title":"European Conference on Computer Vision","author":"Yan Cilin","year":"2024","unstructured":"Cilin Yan, Haochen Wang, Shilin Yan, Xiaolong Jiang, Yao Hu, Guoliang Kang, Weidi Xie, and Efstratios Gavves. 2024. Visa: Reasoning video object segmentation via large language models. In European Conference on Computer Vision. Springer, 98\u2013115."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Jiaxing Yang Lihe Zhang and Huchuan Lu. 2025. Semantics Alternating Enhancement and Bidirectional Aggregation for Referring Video Object Segmentation. IEEE Transactions on Multimedia (2025).","DOI":"10.1109\/TMM.2025.3557689"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01328"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446511"},{"key":"e_1_3_3_2_31_2","unstructured":"Yuxuan Zhang Tianheng Cheng Rui Hu Lei Liu Heng Liu Longjin Ran Xiaoxin Chen Wenyu Liu and Xinggang Wang. 2024. EVF-SAM: Early Vision-Language Fusion for Text-Prompted Segment Anything Model. ArXiv abs\/2406.20076 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270845560"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746059.3747676"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02647"},{"key":"e_1_3_3_2_34_2","first-page":"452","volume-title":"European Conference on Computer Vision","author":"Zhu Zixin","year":"2024","unstructured":"Zixin Zhu, Xuelu Feng, Dongdong Chen, Junsong Yuan, Chunming Qiao, and Gang Hua. 2024. Exploring pre-trained text-to-video diffusion models for referring video object segmentation. In European Conference on Computer Vision. Springer, 452\u2013469."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:22:17Z","timestamp":1781536937000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810836"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":33,"alternative-id":["10.1145\/3805622.3810836","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810836","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}