{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:26:44Z","timestamp":1759332404820,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0162000"],"award-info":[{"award-number":["2022ZD0162000"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222211,62077041,61836002,62072397"],"award-info":[{"award-number":["62222211,62077041,61836002,62072397"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611879","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"5807-5818","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Unsupervised Domain Adaptation for Referring Semantic Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6666-3355","authenticated-orcid":false,"given":"Haonan","family":"Shi","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4300-7694","authenticated-orcid":false,"given":"Wenwen","family":"Pan","sequence":"additional","affiliation":[{"name":"Hangzhou Dianzi University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2296-2983","authenticated-orcid":false,"given":"Mingmin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2139-8807","authenticated-orcid":false,"given":"Fei","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btl242"},{"key":"e_1_3_2_2_2_1","volume-title":"End-to-End Referring Video Object Segmentation with Multimodal Transformers. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR","author":"Botach Adam","year":"2022","unstructured":"Adam Botach, Evgenii Zheltonozhskii, and Chaim Baskin. 2022. End-to-End Referring Video Object Segmentation with Multimodal Transformers. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022. IEEE, 4975--4985."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16192"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.363"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2303.05309"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2968484"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1423"},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 31th International Conference on Machine Learning, ICML 2014 (JMLR Workshop and Conference Proceedings","volume":"655","author":"Donahue Jeff","year":"2014","unstructured":"Jeff Donahue, Yangqing Jia, Oriol Vinyals, Judy Hoffman, Ning Zhang, Eric Tzeng, and Trevor Darrell. 2014. DeCAF: A Deep Convolutional Activation Feature for Generic Visual Recognition. In Proceedings of the 31th International Conference on Machine Learning, ICML 2014 (JMLR Workshop and Conference Proceedings, Vol. 32). JMLR.org, 647--655."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00107"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-58347-1_10"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00624"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_13_1","volume-title":"One-Shot Adaptation of Supervised Deep Convolutional Models. In 2nd International Conference on Learning Representations, ICLR","author":"Hoffman Judy","year":"2014","unstructured":"Judy Hoffman, Eric Tzeng, Jeff Donahue, Yangqing Jia, Kate Saenko, and Trevor Darrell. 2014. One-Shot Adaptation of Supervised Deep Convolutional Models. In 2nd International Conference on Learning Representations, ICLR 2014, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_2_14_1","volume-title":"Proceedings of the 35th International Conference on Machine Learning, ICML 2018 (Proceedings of Machine Learning Research","volume":"2003","author":"Hoffman Judy","year":"2018","unstructured":"Judy Hoffman, Eric Tzeng, Taesung Park, Jun-Yan Zhu, Phillip Isola, Kate Saenko, Alexei A. Efros, and Trevor Darrell. 2018. CyCADA: Cycle-Consistent Adversarial Domain Adaptation. In Proceedings of the 35th International Conference on Machine Learning, ICML 2018 (Proceedings of Machine Learning Research, Vol. 80), Jennifer G. Dy and Andreas Krause (Eds.). PMLR, 1994--2003."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"e_1_3_2_2_16_1","volume-title":"Utilizing Large Scale Vision and Text Datasets for Image Segmentation from Referring Expressions. CoRR","author":"Hu Ronghang","year":"2016","unstructured":"Ronghang Hu, Marcus Rohrbach, Subhashini Venugopalan, and Trevor Darrell. 2016b. Utilizing Large Scale Vision and Text Datasets for Image Segmentation from Referring Expressions. CoRR, Vol. abs\/1608.08305 (2016). arxiv: 1608.08305"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"e_1_3_2_2_19_1","volume-title":"Locate then Segment: A Strong Pipeline for Referring Image Segmentation. CoRR","author":"Jing Ya","year":"2021","unstructured":"Ya Jing, Tao Kong, Wei Wang, Liang Wang, Lei Li, and Tieniu Tan. 2021. Locate then Segment: A Strong Pipeline for Referring Image Segmentation. CoRR, Vol. abs\/2103.16284 (2021). arxiv: 2103.16284"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3029948"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/d14-1086"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20870-7_8"},{"key":"e_1_3_2_2_23_1","volume-title":"7th International Conference on Learning Representations, ICLR","author":"Lee Kuan-Hui","year":"2019","unstructured":"Kuan-Hui Lee, Germ\u00e1 n Ros, Jie Li, and Adrien Gaidon. 2019. SPIGAN: Privileged Adversarial Learning from Simulation. In 7th International Conference on Learning Representations, ICLR 2019. OpenReview.net."},{"key":"e_1_3_2_2_24_1","volume-title":"FNet: Mixing Tokens with Fourier Transforms. CoRR","author":"Lee-Thorp James","year":"2021","unstructured":"James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, and Santiago Onta n \u00f3 n. 2021. FNet: Mixing Tokens with Fourier Transforms. CoRR, Vol. abs\/2105.03824 (2021). showeprint[arXiv]2105.03824"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00602"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00686"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2995122"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.143"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings of the 32nd International Conference on Machine Learning, ICML 2015 (JMLR Workshop and Conference Proceedings","volume":"105","author":"Long Mingsheng","unstructured":"Mingsheng Long, Yue Cao, Jianmin Wang, and Michael I. Jordan. 2015. Learning Transferable Features with Deep Adaptation Networks. In Proceedings of the 32nd International Conference on Machine Learning, ICML 2015 (JMLR Workshop and Conference Proceedings, Vol. 37), Francis R. Bach and David M. Blei (Eds.). JMLR.org, 97--105."},{"key":"e_1_3_2_2_30_1","volume-title":"Proceedings of the 34th International Conference on Machine Learning,ICML 2017 (Proceedings of Machine Learning Research","volume":"2217","author":"Long Mingsheng","unstructured":"Mingsheng Long, Han Zhu, Jianmin Wang, and Michael I. Jordan. 2017. Deep Transfer Learning with Joint Adaptation Networks. In Proceedings of the 34th International Conference on Machine Learning,ICML 2017 (Proceedings of Machine Learning Research, Vol. 70), Doina Precup and Yee Whye Teh (Eds.). PMLR, 2208--2217."},{"key":"e_1_3_2_2_31_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada. 13--23."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00688"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00261"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_39"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00473"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/132"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00138"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2942480"},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763. http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_2_41_1","volume-title":"Proceedings, Part II (Lecture Notes in Computer Science","volume":"118","author":"Richter Stephan R.","year":"2016","unstructured":"Stephan R. Richter, Vibhav Vineet, Stefan Roth, and Vladlen Koltun. 2016. Playing for Data: Ground Truth from Computer Games. In Computer Vision - ECCV 2016 - 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part II (Lecture Notes in Computer Science, Vol. 9906), Bastian Leibe, Jiri Matas, Nicu Sebe, and Max Welling (Eds.). Springer, 102--118."},{"key":"e_1_3_2_2_42_1","volume-title":"L\u00f3 pez","author":"Ros Germ\u00e1n","year":"2016","unstructured":"Germ\u00e1n Ros, Laura Sellart, Joanna Materzynska, David V\u00e1zquez, and Antonio M. L\u00f3 pez. 2016. The SYNTHIA Dataset: A Large Collection of Synthetic Images for Semantic Segmentation of Urban Scenes. In 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR. IEEE Computer Society, 3234--3243."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00392"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_13"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_3"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2991504"},{"key":"e_1_3_2_2_47_1","volume-title":"Whitening Sentence Representations for Better Semantics and Faster Retrieval. CoRR","author":"Su Jianlin","year":"2021","unstructured":"Jianlin Su, Jiarun Cao, Weijie Liu, and Yangyiwen Ou. 2021. Whitening Sentence Representations for Better Semantics and Faster Retrieval. CoRR, Vol. abs\/2103.15316 (2021). showeprint[arXiv]2103.15316"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00780"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1656"},{"key":"e_1_3_2_2_51_1","volume-title":"Deep Domain Confusion: Maximizing for Domain Invariance. CoRR","author":"Tzeng Eric","year":"2014","unstructured":"Eric Tzeng, Judy Hoffman, Ning Zhang, Kate Saenko, and Trevor Darrell. 2014. Deep Domain Confusion: Maximizing for Domain Invariance. CoRR, Vol. abs\/1412.3474 (2014). showeprint[arXiv]1412.3474"},{"key":"e_1_3_2_2_52_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA. 5998--6008."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00746"},{"key":"e_1_3_2_2_54_1","volume-title":"Context Modulated Dynamic Networks for Actor and Action Video Segmentation with Language Queries. In The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI","author":"Wang Hao","year":"2020","unstructured":"Hao Wang, Cheng Deng, Fan Ma, and Yi Yang. 2020. Context Modulated Dynamic Networks for Actor and Action Video Segmentation with Language Queries. In The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020. AAAI Press, 12152--12159."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00404"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2910667"},{"key":"e_1_3_2_2_57_1","volume-title":"End-to-End Video Instance Segmentation With Transformers. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021","author":"Wang Yuqing","year":"2021","unstructured":"Yuqing Wang, Zhaoliang Xu, Xinlong Wang, Chunhua Shen, Baoshan Cheng, Hao Shen, and Huaxia Xia. 2021. End-to-End Video Instance Segmentation With Transformers. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19-25, 2021. Computer Vision Foundation \/ IEEE, 8741--8750."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01198-w"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00444"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298839"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00414"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01075"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.01.035"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2842206"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00712"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_18"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611879","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611879","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:24Z","timestamp":1755820884000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611879"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":68,"alternative-id":["10.1145\/3581783.3611879","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611879","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}