{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:07:36Z","timestamp":1775326056361,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"publisher","award":["No.BK20191258"],"award-info":[{"award-number":["No.BK20191258"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Nanjing Purple Mountain Laboratories"},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62172089, No.61972087, No.62172090, No.62106045"],"award-info":[{"award-number":["No. 62172089, No.61972087, No.62172090, No.62106045"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Jiangsu Provincial Key Laboratory of Computer Networking Technology"},{"name":"Jiangsu Provincial Key Laboratory of Network and Information Security","award":["No. BM2003201"],"award-info":[{"award-number":["No. BM2003201"]}]},{"name":"Key Laboratory of Computer Network and Information Integration of Ministry of Education of China","award":["No. 93K-9"],"award-info":[{"award-number":["No. 93K-9"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680657","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1895-1904","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Consistencies are All You Need for Semi-supervised Vision-Language Tracking"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7268-7815","authenticated-orcid":false,"given":"Jiawei","family":"Ge","sequence":"first","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2448-6717","authenticated-orcid":false,"given":"Jiuxin","family":"Cao","sequence":"additional","affiliation":[{"name":"Southeast University &amp; Purple Mountain Laboratories, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7676-2843","authenticated-orcid":false,"given":"Xuelin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2838-1445","authenticated-orcid":false,"given":"Xinyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3865-2434","authenticated-orcid":false,"given":"Chang","family":"Liu","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6735-7667","authenticated-orcid":false,"given":"Kun","family":"Wang","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5209-9063","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"additional","affiliation":[{"name":"Southeast University &amp; Purple Mountain Laboratories, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298799"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00264"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.733"},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00552"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093425"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"e_1_3_2_1_12_1","volume-title":"Beyond Visual Cues: Synchronously Exploring Target-Centric Semantics for Vision-Language Tracking. arXiv preprint arXiv:2311.17085","author":"Ge Jiawei","year":"2023","unstructured":"Jiawei Ge, Xiangmei Chen, Jiuxin Cao, Xuelin Zhu, Weijia Liu, and Bo Liu. 2023. Beyond Visual Cues: Synchronously Exploring Target-Centric Semantics for Vision-Language Tracking. arXiv preprint arXiv:2311.17085 (2023)."},{"key":"e_1_3_2_1_13_1","unstructured":"Mingzhe Guo Zhipeng Zhang Heng Fan and Liping Jing. [n. d.]. Divert More Attention to Vision-Language Tracking. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2345390"},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"e_1_3_2_1_17_1","volume-title":"Chang Wen Chen, and Qing Li","author":"Jiang Yiyang","year":"2024","unstructured":"Yiyang Jiang, Wengyu Zhang, Xulu Zhang, Xiaoyong Wei, Chang Wen Chen, and Qing Li. 2024. Prior Knowledge Integration via LLM Encoding and Pseudo Event Regulation for Video Moment Retrieval. arxiv: 2407.15051 [cs.CV] https:\/\/arxiv.org\/abs\/2407.15051"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00577"},{"key":"e_1_3_2_1_19_1","volume-title":"IEEE transactions on pattern analysis and machine intelligence","author":"Kalal Zdenek","year":"2011","unstructured":"Zdenek Kalal, Krystian Mikolajczyk, and Jiri Matas. 2011. Tracking-learning-detection. IEEE transactions on pattern analysis and machine intelligence, Vol. 34, 7 (2011), 1409--1422."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.3390\/app11062646"},{"key":"e_1_3_2_1_21_1","volume-title":"International conference on machine learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_22_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 (2017) 32--73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_24_1","volume-title":"Workshop on challenges in representation learning, ICML","volume":"3","author":"Dong-Hyun","unstructured":"Dong-Hyun Lee et al. 2013. Pseudo-label: The simple and efficient semi-supervised learning method for deep neural networks. In Workshop on challenges in representation learning, ICML, Vol. 3. Atlanta, 896."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 5088--5096","author":"Lee Dae-Youn","year":"2015","unstructured":"Dae-Youn Lee, Jae-Young Sim, and Chang-Su Kim. 2015. Multihypothesis trajectory analysis for robust visual tracking. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 5088--5096."},{"key":"e_1_3_2_1_26_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00626"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01146"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.777"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01469"},{"key":"e_1_3_2_1_35_1","volume-title":"Kd-vlp: Improving end-to-end vision-and-language pretraining with object knowledge distillation. arXiv preprint arXiv:2109.10504","author":"Liu Yongfei","year":"2021","unstructured":"Yongfei Liu, Chenfei Wu, Shao-yen Tseng, Vasudev Lal, Xuming He, and Nan Duan. 2021. Kd-vlp: Improving end-to-end vision-and-language pretraining with object knowledge distillation. arXiv preprint arXiv:2109.10504 (2021)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_37_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01032"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475349"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSC.2019.2916416"},{"key":"e_1_3_2_1_42_1","volume-title":"Beit v2: Masked image modeling with vector-quantized visual tokenizers. arXiv","author":"Peng Z","year":"2022","unstructured":"Z Peng, L Dong, H Bao, Q Ye, and F Wei. [n.,d.]. Beit v2: Masked image modeling with vector-quantized visual tokenizers. arXiv 2022. arXiv preprint arXiv:2208.06366 ( [n.,d.])."},{"key":"e_1_3_2_1_43_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_1_45_1","volume-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 6 (2016), 1137--1149."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413611"},{"key":"e_1_3_2_1_47_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_48_1","volume-title":"A survey on semi-supervised learning. Machine learning","author":"Van Engelen Jesper E","year":"2020","unstructured":"Jesper E Van Engelen and Holger H Hoos. 2020. A survey on semi-supervised learning. Machine learning, Vol. 109, 2 (2020), 373--440."},{"key":"e_1_3_2_1_49_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_50_1","volume-title":"International Conference on Machine Learning. PMLR, 36138--36156","author":"Wang Jianfeng","year":"2023","unstructured":"Jianfeng Wang, Daniela Massiceti, Xiaolin Hu, Vladimir Pavlovic, and Thomas Lukasiewicz. 2023. NP-SemiSeg: when neural processes meet semi-supervised semantic segmentation. In International Conference on Machine Learning. PMLR, 36138--36156."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00140"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01357-4"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3069908"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"e_1_3_2_1_55_1","volume-title":"E2E-VLP: End-to-end vision-language pre-training enhanced by visual learning. arXiv preprint arXiv:2106.01804","author":"Xu Haiyang","year":"2021","unstructured":"Haiyang Xu, Ming Yan, Chenliang Li, Bin Bi, Songfang Huang, Wenming Xiao, and Fei Huang. 2021. E2E-VLP: End-to-end vision-language pre-training enhanced by visual learning. arXiv preprint arXiv:2106.01804 (2021)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3038720"},{"key":"e_1_3_2_1_57_1","volume-title":"X 2-vlm: All-in-one pre-trained model for vision-language tasks","author":"Zeng Yan","year":"2023","unstructured":"Yan Zeng, Xinsong Zhang, Hang Li, Jiawei Wang, Jipeng Zhang, and Wangchunshu Zhou. 2023. X 2-vlm: All-in-one pre-trained model for vision-language tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01309"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01329"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548343"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00142"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680657","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680657","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680657"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":63,"alternative-id":["10.1145\/3664647.3680657","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680657","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}