{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:20:17Z","timestamp":1775326817431,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the FuXiaQuan National Independent Innovation Demonstration Zone Collaborative Innovation Platform Project under Grant","award":["3502ZCQXT2022008"],"award-info":[{"award-number":["3502ZCQXT2022008"]}]},{"name":"the National Natural Science Foundation of China under Grants","award":["U21A20514, 62372388, 62071404"],"award-info":[{"award-number":["U21A20514, 62372388, 62071404"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681530","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"2457-2466","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["GLATrack: Global and Local Awareness for Open-Vocabulary Multiple Object Tracking"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2510-6378","authenticated-orcid":false,"given":"Guangyao","family":"Li","sequence":"first","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3383-7791","authenticated-orcid":false,"given":"Yajun","family":"Jian","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3383-7791","authenticated-orcid":false,"given":"Yajun","family":"Jian","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3674-7160","authenticated-orcid":false,"given":"Yan","family":"Yan","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3674-7160","authenticated-orcid":false,"given":"Yan","family":"Yan","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6913-9786","authenticated-orcid":false,"given":"Hanzi","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6913-9786","authenticated-orcid":false,"given":"Hanzi","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University &amp; Fujian Key Laboratory of Sensing and Computing for Smart City, School of Informatics, Xiamen University, Xiamen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587583"},{"key":"e_1_3_2_1_2_1","first-page":"33781","article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","volume":"35","author":"Bangalath Hanoona","year":"2022","unstructured":"Hanoona Bangalath, Muhammad Maaz, Muhammad Uzair Khattak, Salman H Khan, and Fahad Shahbaz Khan. 2022. Bridging the gap between object and image-level representations for open-vocabulary detection. Advances in Neural Information Processing Systems, Vol. 35 (2022), 33781--33794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00103"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7533003"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00628"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02252"},{"key":"e_1_3_2_1_8_1","volume-title":"Wooyoung Kang, and Byungseok Roh.","author":"Cho Han-Cheol","year":"2023","unstructured":"Han-Cheol Cho, Won Young Jhoo, Wooyoung Kang, and Byungseok Roh. 2023. Open-Vocabulary Object Detection using Pseudo Caption Labels. arXiv preprint arXiv:2303.13040 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00910"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_26"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01393-0"},{"key":"e_1_3_2_1_12_1","volume-title":"1st place solution to eccv-tao-2020: Detect and represent any object for tracking. arXiv preprint arXiv:2101.08040","author":"Du Fei","year":"2021","unstructured":"Fei Du, Bo Xu, Jiasheng Tang, Yuqi Zhang, Fan Wang, and Hao Li. 2021. 1st place solution to eccv-tao-2020: Detect and represent any object for tracking. arXiv preprint arXiv:2101.08040 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3301975"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611728"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2938758"},{"key":"e_1_3_2_1_17_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01269"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1561\/9781680836899"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_29"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00539"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01846"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01375-2"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14074--14083","author":"Ma Zongyang","year":"2022","unstructured":"Zongyang Ma, Guan Luo, Jin Gao, Liang Li, Yuxin Chen, Shaoru Wang, Congxuan Zhang, and Weiming Hu. 2022. Open-vocabulary one-stage detection with hierarchical visual-language knowledge distillation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14074--14083."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"M Minderer A Gritsenko A Stone M Neumann D Weissenborn A Dosovitskiy A Mahendran A Arnab M Dehghani Z Shen et al. 2022. Simple open-vocabulary object detection with vision transformers. arXiv preprint arXiv:2205.06230 Vol. 2 (2022).","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995586"},{"key":"e_1_3_2_1_30_1","volume-title":"IEEE International Conference on Robotics and Automation. 3494--3501","author":"Ovsep Aljovsa","year":"2018","unstructured":"Aljovsa Ovsep, Wolfgang Mehner, Paul Voigtlaender, and Bastian Leibe. 2018. Track, then decide: Category-agnostic vision-based multi-object tracking. In IEEE International Conference on Robotics and Automation. 3494--3501."},{"key":"e_1_3_2_1_31_1","volume-title":"IEEE International Conference on Robotics and Automation. 10031--10037","author":"Ovsep Aljovsa","year":"2020","unstructured":"Aljovsa Ovsep, Paul Voigtlaender, Mark Weber, Jonathon Luiten, and Bastian Leibe. 2020. 4d generic video object proposals. In IEEE International Conference on Robotics and Automation. 10031--10037."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416283"},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01086"},{"key":"e_1_3_2_1_35_1","first-page":"91","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","volume":"28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in Neural Information Processing Systems, Vol. 28 (2015), 91--99.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01410"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416297"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"e_1_3_2_1_40_1","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in Neural Information Processing Systems, Vol. 30 (2017), 5998--6008.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475304"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00375"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548231"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"e_1_3_2_1_46_1","volume-title":"So Kweon, and Joon-Young Lee.","author":"Woo Sanghyun","year":"2022","unstructured":"Sanghyun Woo, Kwanyong Park, Seoung Wug Oh, In So Kweon, and Joon-Young Lee. 2022. Bridging images and videos: A simple learning framework for large vocabulary video object detection. In Proceedings of the European Conference on Computer Vision. 238--258."},{"key":"e_1_3_2_1_47_1","volume-title":"So Kweon, and Joon-Young Lee. 2022. Tracking by associating clips. In Proceedings of the European Conference on Computer Vision. 129--145","author":"Woo Sanghyun","unstructured":"Sanghyun Woo, Kwanyong Park, Seoung Wug Oh, In So Kweon, and Joon-Young Lee. 2022. Tracking by associating clips. In Proceedings of the European Conference on Computer Vision. 129--145."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416282"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00409"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00529"},{"key":"e_1_3_2_1_53_1","volume-title":"Aim: Adapting image models for efficient video action recognition. arXiv preprint arXiv:2302.03024","author":"Yang Taojiannan","year":"2023","unstructured":"Taojiannan Yang, Yi Zhu, Yusheng Xie, Aston Zhang, Chen Chen, and Mu Li. 2023. Aim: Adapting image models for efficient video action recognition. arXiv preprint arXiv:2302.03024 (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611868"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_38"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3481546"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01513-4"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00214"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02112"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25459"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_28"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00857"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681530","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681530","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681530"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":68,"alternative-id":["10.1145\/3664647.3681530","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681530","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}