{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T17:57:44Z","timestamp":1772301464290,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Natural Science Foundation of Tianjin, China","award":["22JCQNJC01580, 22JCJQJC00150"],"award-info":[{"award-number":["22JCQNJC01580, 22JCJQJC00150"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302243, 62272250"],"award-info":[{"award-number":["62302243, 62272250"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Fundamental Research Funds for the Central Universities, Nankai University","award":["63241442"],"award-info":[{"award-number":["63241442"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680817","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"5356-5364","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Tracking-forced Referring Video Object Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7793-4030","authenticated-orcid":false,"given":"Ruxue","family":"Yan","sequence":"first","affiliation":[{"name":"VCIP, TMCC, TBI Center, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5609-194X","authenticated-orcid":false,"given":"Wenya","family":"Guo","sequence":"additional","affiliation":[{"name":"VCIP, TMCC, TBI Center, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8257-6030","authenticated-orcid":false,"given":"Xubo","family":"Liu","sequence":"additional","affiliation":[{"name":"VCIP, TMCC, TBI Center, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8902-9893","authenticated-orcid":false,"given":"Xumeng","family":"Liu","sequence":"additional","affiliation":[{"name":"VCIP, TMCC, TBI Center, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4906-5828","authenticated-orcid":false,"given":"Ying","family":"Zhang","sequence":"additional","affiliation":[{"name":"VCIP, TMCC, TBI Center, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5876-6856","authenticated-orcid":false,"given":"Xiaojie","family":"Yuan","sequence":"additional","affiliation":[{"name":"VCIP, TMCC, TBI Center, College of Computer Science, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"End-to-End Referring Video Object Segmentation with Multimodal Transformers. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022","author":"Botach Adam","year":"2022","unstructured":"Adam Botach, Evgenii Zheltonozhskii, and Chaim Baskin. 2022. End-to-End Referring Video Object Segmentation with Multimodal Transformers. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18--24, 2022. IEEE, 4975--4985."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2306.08707"},{"key":"e_1_3_2_1_3_1","volume-title":"The 3rd Large-scale Video Object Segmentation Challenge","volume":"8","author":"Ding Zihan","year":"2021","unstructured":"Zihan Ding, Tianrui Hui, Shaofei Huang, Si Liu, Xuan Luo, Junshi Huang, and Xiaoming Wei. 2021. Progressive multimodal interaction network for referring video object segmentation. The 3rd Large-scale Video Object Segmentation Challenge, Vol. 8 (2021), 6."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00624"},{"key":"e_1_3_2_1_5_1","volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27--30, 2016. IEEE Computer Society, 770--778."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--319--46448-0_7"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00548"},{"key":"e_1_3_2_1_8_1","volume-title":"Collaborative Spatial-Temporal Modeling for Language-Queried Video Actor Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021","author":"Hui Tianrui","year":"2021","unstructured":"Tianrui Hui, Shaofei Huang, Si Liu, Zihan Ding, Guanbin Li, Wenguan Wang, Jizhong Han, and Fei Wang. 2021. Collaborative Spatial-Temporal Modeling for Language-Queried Video Actor Segmentation. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19--25, 2021. Computer Vision Foundation \/ IEEE, 4187--4196."},{"key":"e_1_3_2_1_9_1","volume-title":"Towards Understanding Action Recognition. In IEEE International Conference on Computer Vision, ICCV 2013","author":"Jhuang Hueihan","year":"2013","unstructured":"Hueihan Jhuang, Juergen Gall, Silvia Zuffi, Cordelia Schmid, and Michael J. Black. 2013. Towards Understanding Action Recognition. In IEEE International Conference on Computer Vision, ICCV 2013, Sydney, Australia, December 1--8, 2013. IEEE Computer Society, 3192--3199."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s12559-023-10215-7"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3-030--20870--7_8"},{"key":"e_1_3_2_1_12_1","volume-title":"Shape-Aware Text-Driven Layered Video Editing. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023","author":"Lee Yao-Chih","year":"2023","unstructured":"Yao-Chih Lee, Ji-Ze Genevieve Jang, Yi-Ting Chen, Elizabeth Qiu, and Jia-Bin Huang. 2023. Shape-Aware Text-Driven Layered Video Editing. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17--24, 2023. IEEE, 14317--14326."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2023.3323665"},{"key":"e_1_3_2_1_14_1","volume-title":"ClawCraneNet: Leveraging Object-level Relation for Text-based Video Segmentation. CoRR","author":"Liang Chen","year":"2021","unstructured":"Chen Liang, Yu Wu, Yawei Luo, and Yi Yang. 2021. ClawCraneNet: Leveraging Object-level Relation for Text-based Video Segmentation. CoRR, Vol. abs\/2103.10702 (2021). showeprint[arXiv]2103.10702 https:\/\/arxiv.org\/abs\/2103.10702"},{"key":"e_1_3_2_1_15_1","volume-title":"Rethinking Cross-modal Interaction from a Top-down Perspective for Referring Video Object Segmentation. CoRR","author":"Liang Chen","year":"2021","unstructured":"Chen Liang, Yu Wu, Tianfei Zhou, Wenguan Wang, Zongxin Yang, Yunchao Wei, and Yi Yang. 2021. Rethinking Cross-modal Interaction from a Top-down Perspective for Referring Video Object Segmentation. CoRR, Vol. abs\/2106.01061 (2021). showeprint[arXiv]2106.01061 https:\/\/arxiv.org\/abs\/2106.01061"},{"key":"e_1_3_2_1_16_1","volume-title":"Focal Loss for Dense Object Detection. In IEEE International Conference on Computer Vision, ICCV 2017","author":"Lin Tsung-Yi","year":"2017","unstructured":"Tsung-Yi Lin, Priya Goyal, Ross B. Girshick, Kaiming He, and Piotr Doll\u00e1r. 2017. Focal Loss for Dense Object Detection. In IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22--29, 2017. IEEE Computer Society, 2999--3007."},{"key":"e_1_3_2_1_17_1","first-page":"4761","article-title":"Cross-Modal Progressive Comprehension for Referring Segmentation","volume":"44","author":"Liu Si","year":"2022","unstructured":"Si Liu, Tianrui Hui, Shaofei Huang, Yunchao Wei, Bo Li, and Guanbin Li. 2022. Cross-Modal Progressive Comprehension for Referring Segmentation. IEEE Trans. Pattern Anal. Mach. Intell., Vol. 44, 9 (2022), 4761--4775.","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"e_1_3_2_1_18_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR, Vol. abs\/1907.11692 (2019). showeprint[arXiv]1907.11692 http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_20_1","volume-title":"Video Swin Transformer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022","author":"Liu Ze","year":"2022","unstructured":"Ze Liu, Jia Ning, Yue Cao, Yixuan Wei, Zheng Zhang, Stephen Lin, and Han Hu. 2022. Video Swin Transformer. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18--24, 2022. IEEE, 3192--3201."},{"key":"e_1_3_2_1_21_1","volume-title":"Visual-Textual Capsule Routing for Text-Based Video Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020","author":"McIntosh Bruce","year":"2020","unstructured":"Bruce McIntosh, Kevin Duarte, Yogesh S. Rawat, and Mubarak Shah. 2020. Visual-Textual Capsule Routing for Text-Based Video Segmentation. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13--19, 2020. Computer Vision Foundation \/ IEEE, 9939--9948."},{"key":"e_1_3_2_1_22_1","volume-title":"V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation. In Fourth International Conference on 3D Vision, 3DV 2016","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation. In Fourth International Conference on 3D Vision, 3DV 2016, Stanford, CA, USA, October 25--28, 2016. IEEE Computer Society, 565--571."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/132"},{"key":"e_1_3_2_1_24_1","volume-title":"The 2017 DAVIS Challenge on Video Object Segmentation. CoRR","author":"Pont-Tuset Jordi","year":"2017","unstructured":"Jordi Pont-Tuset, Federico Perazzi, Sergi Caelles, Pablo Arbelaez, Alexander Sorkine-Hornung, and Luc Van Gool. 2017. The 2017 DAVIS Challenge on Video Object Segmentation. CoRR, Vol. abs\/1704.00675 (2017)."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings, Part XV (Lecture Notes in Computer Science","volume":"223","author":"Seo Seonguk","year":"2020","unstructured":"Seonguk Seo, Joon-Young Lee, and Bohyung Han. 2020. URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark. In Computer Vision - ECCV 2020 - 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XV (Lecture Notes in Computer Science, Vol. 12360). Springer, 208--223."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6895"},{"key":"e_1_3_2_1_27_1","volume-title":"Asymmetric Cross-Guided Attention Network for Actor and Action Video Segmentation From Natural Language Query. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019","author":"Wang Hao","year":"2019","unstructured":"Hao Wang, Cheng Deng, Junchi Yan, and Dacheng Tao. 2019. Asymmetric Cross-Guided Attention Network for Actor and Action Video Segmentation From Natural Language Query. In 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 - November 2, 2019. IEEE, 3938--3947."},{"key":"e_1_3_2_1_28_1","volume-title":"OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation. In IEEE\/CVF International Conference on Computer Vision, ICCV 2023","author":"Wu Dongming","year":"2023","unstructured":"Dongming Wu, Tiancai Wang, Yuang Zhang, Xiangyu Zhang, and Jianbing Shen. 2023. OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation. In IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1--6, 2023. IEEE, 2749--2758."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00492"},{"key":"e_1_3_2_1_30_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2015","author":"Xu Chenliang","year":"2015","unstructured":"Chenliang Xu, Shao-Hang Hsieh, Caiming Xiong, and Jason J. Corso. 2015. Can humans fly? Action understanding with multiple classes of actors. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2015, Boston, MA, USA, June 7--12, 2015. IEEE Computer Society, 2264--2273."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings, Part V (Lecture Notes in Computer Science","volume":"619","author":"Xu Ning","unstructured":"Ning Xu, Linjie Yang, Yuchen Fan, Jianchao Yang, Dingcheng Yue, Yuchen Liang, Brian L. Price, Scott Cohen, and Thomas S. Huang. 2018. YouTube-VOS: Sequence-to-Sequence Video Object Segmentation. In Computer Vision - ECCV 2018 - 15th European Conference, Munich, Germany, September 8--14, 2018, Proceedings, Part V (Lecture Notes in Computer Science, Vol. 11209). Springer, 603--619."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01075"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3054384"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings, Part II (Lecture Notes in Computer Science","volume":"85","author":"Yu Licheng","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C. Berg, and Tamara L. Berg. 2016. Modeling Context in Referring Expressions. In Computer Vision - ECCV 2016 - 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part II (Lecture Notes in Computer Science, Vol. 9906). Springer, 69--85."},{"key":"e_1_3_2_1_35_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021","author":"Zhu Xizhou","year":"2021","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2021. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3--7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=gZ9hCDWe6ke"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680817","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680817","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680817"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":35,"alternative-id":["10.1145\/3664647.3680817","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680817","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}