{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T15:45:41Z","timestamp":1776354341208,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612314","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"3807-3816","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Unsupervised Domain Adaptation for Video Object Grounding with Cascaded Debiasing Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3482-234X","authenticated-orcid":false,"given":"Mengze","family":"Li","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4632-3677","authenticated-orcid":false,"given":"Haoyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2258-1291","authenticated-orcid":false,"given":"Juncheng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University &amp; Shanghai Institute for Advanced Study of Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5988-7609","authenticated-orcid":false,"given":"Wenqiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0030-8289","authenticated-orcid":false,"given":"Shengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Software Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5269-7821","authenticated-orcid":false,"given":"Shiliang","family":"Pu","sequence":"additional","affiliation":[{"name":"Hikvison, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9017-2508","authenticated-orcid":false,"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2139-8807","authenticated-orcid":false,"given":"Fei","family":"Wu","sequence":"additional","affiliation":[{"name":"Shanghai Institute for Advanced Study of Zhejiang University &amp; Shanghai AI Laboratory, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207304"},{"key":"e_1_3_2_1_2_1","volume-title":"Can Zhang, and Yuexian Zou.","author":"Cao Meng","year":"2021","unstructured":"Meng Cao, Long Chen, Mike Zheng Shou, Can Zhang, and Yuexian Zou. 2021. On pursuit of designing multi-modal transformer for video grounding. arXiv preprint arXiv:2109.06085 (2021)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_4_1","volume-title":"Modular Domain Adaptation. arXiv preprint arXiv:2204.14213","author":"Chen Junshen K","year":"2022","unstructured":"Junshen K Chen, Dallas Card, and Dan Jurafsky. 2022. Modular Domain Adaptation. arXiv preprint arXiv:2204.14213 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1183"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Ross Girshick Jeff Donahue Trevor Darrell and Jitendra Malik. 2013. Rich feature hierarchies for accurate object detection and semantic segmentation. arXiv preprint arXiv:1311.2524.","DOI":"10.1109\/CVPR.2014.81"},{"key":"e_1_3_2_1_7_1","volume-title":"Meta-learning adversarial domain adaptation network for few-shot text classification. arXiv preprint arXiv:2107.12262","author":"Han Chengcheng","year":"2021","unstructured":"Chengcheng Han, Zeqiu Fan, Dongxiang Zhang, Minghui Qiu, Ming Gao, and Aoying Zhou. 2021. Meta-learning adversarial domain adaptation network for few-shot text classification. arXiv preprint arXiv:2107.12262 (2021)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548009"},{"key":"e_1_3_2_1_10_1","volume-title":"Mrtnet: Multi-resolution temporal network for video sentence grounding. arXiv preprint arXiv:2212.13163","author":"Ji Wei","year":"2022","unstructured":"Wei Ji, Long Chen, Yinwei Wei, Yiming Wu, and Tat-Seng Chua. 2022. Mrtnet: Multi-resolution temporal network for video sentence grounding. arXiv preprint arXiv:2212.13163 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612088"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02204"},{"key":"e_1_3_2_1_13_1","volume-title":"Decoupled Adaptation for Cross-Domain Object Detection. arXiv preprint arXiv:2110.02578","author":"Jiang Junguang","year":"2021","unstructured":"Junguang Jiang, Baixu Chen, Jianmin Wang, and Mingsheng Long. 2021. Decoupled Adaptation for Cross-Domain Object Detection. arXiv preprint arXiv:2110.02578 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_15_1","unstructured":"Juncheng Li Minghe Gao Longhui Wei Siliang Tang Wenqiao Zhang Mengze Li Wei Ji Qi Tian Tat-Seng Chua and Yueting Zhuang. 2023 a. Gradient-Regulated Meta-Prompt Learning for Generalizable Vision-Language Models. (2023)."},{"key":"e_1_3_2_1_16_1","unstructured":"Juncheng Li XIN HE Longhui Wei Long Qian Linchao Zhu Lingxi Xie Yueting Zhuang Qi Tian and Siliang Tang. 2022a. Fine-Grained Semantically Aligned Vision-Language Pre-Training. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00188"},{"key":"e_1_3_2_1_18_1","volume-title":"2023 b. Variational Cross-Graph Reasoning and Adaptive Structured Semantics Learning for Compositional Temporal Grounding","author":"Li Juncheng","year":"2023","unstructured":"Juncheng Li, Siliang Tang, Linchao Zhu, Wenqiao Zhang, Yi Yang, Tat-Seng Chua, and Fei Wu. 2023 b. Variational Cross-Graph Reasoning and Adaptive Structured Semantics Learning for Compositional Temporal Grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01214"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02211"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.254"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.596"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548333"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547990"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"e_1_3_2_1_27_1","volume-title":"Cees GM Snoek, and Arnold WM Smeulders","author":"Li Zhenyang","year":"2017","unstructured":"Zhenyang Li, Ran Tao, Efstratios Gavves, Cees GM Snoek, and Arnold WM Smeulders. 2017. Tracking by natural language specification. Piscataway, NJIEEE."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547969"},{"key":"e_1_3_2_1_29_1","volume-title":"Cycle self-training for domain adaptation. arXiv preprint arXiv:2103.03571","author":"Liu Hong","year":"2021","unstructured":"Hong Liu, Jianmin Wang, and Mingsheng Long. 2021. Cycle self-training for domain adaptation. arXiv preprint arXiv:2103.03571 (2021)."},{"key":"e_1_3_2_1_30_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_31_1","unstructured":"Ruotian Luo and Gregory Shakhnarovich. 2017. Comprehension-guided referring expressions. arXiv e-prints arXiv--1701."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475660"},{"key":"e_1_3_2_1_33_1","volume-title":"Beng Chin Ooi, and Fei Wu. 2023 a. IDEAL: Toward High-efficiency Device-Cloud Collaborative and Dynamic Recommendation System. arXiv preprint arXiv:2302.07335","author":"Lv Zheqi","year":"2023","unstructured":"Zheqi Lv, Zhengyu Chen, Shengyu Zhang, Kun Kuang, Wenqiao Zhang, Mengze Li, Beng Chin Ooi, and Fei Wu. 2023 a. IDEAL: Toward High-efficiency Device-Cloud Collaborative and Dynamic Recommendation System. arXiv preprint arXiv:2302.07335 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"Personalizing Intervened Network for Long-tailed Sequential User Behavior Modeling. arXiv preprint arXiv:2208.09130","author":"Lv Zheqi","year":"2022","unstructured":"Zheqi Lv, Feng Wang, Shengyu Zhang, Kun Kuang, Hongxia Yang, and Fei Wu. 2022. Personalizing Intervened Network for Long-tailed Sequential User Behavior Modeling. arXiv preprint arXiv:2208.09130 (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583451"},{"key":"e_1_3_2_1_36_1","volume-title":"Sentry: Selective entropy optimization via committee consistency for unsupervised domain adaptation. arXiv preprint arXiv:2012.11460","author":"Prabhu Viraj","year":"2020","unstructured":"Viraj Prabhu, Shivam Khare, Deeksha Kartik, and Judy Hoffman. 2020. Sentry: Selective entropy optimization via committee consistency for unsupervised domain adaptation. arXiv preprint arXiv:2012.11460 (2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the CoNLL 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies. Association for Computational Linguistics","author":"Qi Peng","year":"2018","unstructured":"Peng Qi, Timothy Dozat, Yuhao Zhang, and Christopher D. Manning. 2018. Universal Dependency Parsing from Scratch. In Proceedings of the CoNLL 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies. Association for Computational Linguistics, Brussels, Belgium, 160--170. https:\/\/nlp.stanford.edu\/pubs\/qi2018universal.pdf"},{"key":"e_1_3_2_1_38_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497 (2015)."},{"key":"e_1_3_2_1_39_1","volume-title":"Video object grounding using semantic roles in language description. arXiv preprint arXiv:2003.10606","author":"Sadhu Arka","year":"2020","unstructured":"Arka Sadhu, Kan Chen, and Ram Nevatia. 2020. Video object grounding using semantic roles in language description. arXiv preprint arXiv:2003.10606 (2020)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00156"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548316"},{"key":"e_1_3_2_1_42_1","volume-title":"Human-centric spatio-temporal video grounding with visual transformers. arXiv preprint arXiv:2011.05049","author":"Tang Zongheng","year":"2020","unstructured":"Zongheng Tang, Yue Liao, Si Liu, Guanbin Li, Xiaojie Jin, Hongxu Jiang, Qian Yu, and Dong Xu. 2020. Human-centric spatio-temporal video grounding with visual transformers. arXiv preprint arXiv:2011.05049 (2020)."},{"key":"e_1_3_2_1_43_1","volume-title":"Object referring in videos with language and human gaze. arXiv preprint arXiv:1801.01582","author":"Vasudevan Arun Balajee","year":"2018","unstructured":"Arun Balajee Vasudevan, Dengxin Dai, and Luc Van Gool. 2018. Object referring in videos with language and human gaze. arXiv preprint arXiv:1801.01582 (2018)."},{"key":"e_1_3_2_1_44_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547752"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.56"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.797"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548426"},{"key":"e_1_3_2_1_50_1","volume-title":"Prototypical Pseudo Label Denoising and Target Structure Learning for Domain Adaptive Semantic Segmentation. arXiv preprint arXiv:2101.10979","author":"Zhang Pan","year":"2021","unstructured":"Pan Zhang, Bo Zhang, Ting Zhang, Dong Chen, Yong Wang, and Fang Wen. 2021b. Prototypical Pseudo Label Denoising and Target Structure Learning for Domain Adaptive Semantic Segmentation. arXiv preprint arXiv:2101.10979 (2021)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548328"},{"key":"e_1_3_2_1_52_1","volume-title":"BOSS: Bottom-up Cross-modal Semantic Composition with Hybrid Counterfactual Training for Robust Content-based Image Retrieval. arXiv preprint arXiv:2207.04211","author":"Zhang Wenqiao","year":"2022","unstructured":"Wenqiao Zhang, Jiannan Guo, Mengze Li, Haochen Shi, Shengyu Zhang, Juncheng Li, Siliang Tang, and Yueting Zhuang. 2022a. BOSS: Bottom-up Cross-modal Semantic Composition with Hybrid Counterfactual Training for Robust Content-based Image Retrieval. arXiv preprint arXiv:2207.04211 (2022)."},{"key":"e_1_3_2_1_53_1","volume-title":"MAGIC: Multimodal relAtional Graph adversarIal inferenCe for Diverse and Unpaired Text-based Image Captioning. arXiv preprint arXiv:2112.06558","author":"Zhang Wenqiao","year":"2021","unstructured":"Wenqiao Zhang, Haochen Shi, Jiannan Guo, Shengyu Zhang, Qingpeng Cai, Juncheng Li, Sihui Luo, and Yueting Zhuang. 2021a. MAGIC: Multimodal relAtional Graph adversarIal inferenCe for Diverse and Unpaired Text-based Image Captioning. arXiv preprint arXiv:2112.06558 (2021)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2935678"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413746"},{"key":"e_1_3_2_1_56_1","volume-title":"BoostMIS: Boosting Medical Image Semi-supervised Learning with Adaptive Pseudo Labeling and Informative Active Annotation. arXiv preprint arXiv:2203.02533","author":"Zhang Wenqiao","year":"2022","unstructured":"Wenqiao Zhang, Lei Zhu, James Hallinan, Andrew Makmur, Shengyu Zhang, Qingpeng Cai, and Beng Chin Ooi. 2022d. BoostMIS: Boosting Medical Image Semi-supervised Learning with Adaptive Pseudo Labeling and Informative Active Annotation. arXiv preprint arXiv:2203.02533 (2022)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547987"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Zhu Zhang Zhou Zhao Yang Zhao Qi Wang Huasheng Liu and Lianli Gao. 2020b. Where does it exist: Spatio-temporal video grounding for multi-form sentences. arXiv e-prints arXiv--2001.","DOI":"10.1109\/CVPR42600.2020.01068"},{"key":"e_1_3_2_1_59_1","volume-title":"Grounded video description. arXiv preprint arXiv:1812.06587","author":"Zhou Luowei","year":"2018","unstructured":"Luowei Zhou, Yannis Kalantidis, Xinlei Chen, Jason J Corso, and Marcus Rohrbach. 2018. Grounded video description. arXiv preprint arXiv:1812.06587 (2018)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612314","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612314","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:25Z","timestamp":1755820585000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612314"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":59,"alternative-id":["10.1145\/3581783.3612314","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612314","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}