{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:14:41Z","timestamp":1765008881935,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFB4506000"],"award-info":[{"award-number":["2024YFB4506000"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62502140, 62372023"],"award-info":[{"award-number":["62502140, 62372023"]}],"id":[{"id":"10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LQN25F020028"],"award-info":[{"award-number":["LQN25F020028"]}]},{"name":"Open Funding of State Key Laboratory of Intelligent Coal Mining and Strata Control","award":["SKLIS202405"],"award-info":[{"award-number":["SKLIS202405"]}]},{"name":"Research Start-up Funds of Hangzhou International Innovation Institute of Beihang University","award":["2024KQ087"],"award-info":[{"award-number":["2024KQ087"]}]},{"name":"Open Fund of the State Key Laboratory of Software Development Environment","award":["SKLSDE-2023ZX-11"],"award-info":[{"award-number":["SKLSDE-2023ZX-11"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771047","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["InstructTrack: Language-Guided Multi-Object Tracking with Semantic-Aware Association"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-2539-9686","authenticated-orcid":false,"given":"Zishun","family":"Zhou","sequence":"first","affiliation":[{"name":"State Key Laboratory of Virtual Reality Technology and Systems, School of Computer Science and Engineering, Beihang University, Beijing, China and Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1570-6570","authenticated-orcid":false,"given":"Shuai","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2811-8962","authenticated-orcid":false,"given":"Hao","family":"Sheng","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Virtual Reality Technology and Systems, School of Computer Science and Engineering, Beihang University, Beijing, China; Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China and Faculty of Applied Sciences, Macao Polytechnic University, Macao, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1317-540X","authenticated-orcid":false,"given":"Dazhi","family":"Yang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8583-3906","authenticated-orcid":false,"given":"Sentan","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5782-894X","authenticated-orcid":false,"given":"Da","family":"Yang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4796-6382","authenticated-orcid":false,"given":"Zhenglong","family":"Cui","sequence":"additional","affiliation":[{"name":"Key Laboratory of Data Science and Intelligent Computing, Hangzhou International Innovation Institute, Beihang University, Hangzhou, Zhejiang, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Nir Aharon Roy Orfaig and Ben-Zion Bobrovsky. 2022. BoT-SORT: Robust associations multi-pedestrian tracking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2206.14651 (2022)."},{"key":"e_1_3_3_1_3_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang and Jun Tang. 2025. Qwen2.5-VL Technical Report. (2025)."},{"key":"e_1_3_3_1_4_2","unstructured":"Weijie Dai Junnan Li Dongxu Li Ang Tiong Jiawei Zhao Weiqing Wang and Steven Hoi. 2023. InstructBLIP: Towards general-purpose vision-language models with instruction tuning. Advances in Neural Information Processing Systems (NeurIPS) 36 (2023) 49250\u201349267."},{"key":"e_1_3_3_1_5_2","unstructured":"Patrick Dendorfer Hamid Rezatofighi Anton Milan Javen Shi Daniel Cremers Ian Reid Stefan Roth Konrad Schindler and Laura Leal-Taix\u00e9. 2020. MOT20: A benchmark for multi object tracking in crowded scenes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2003.09003 (2020). https:\/\/motchallenge.net\/data\/MOT20\/ Online at MOTChallenge; https:\/\/arxiv.org\/abs\/2003.09003."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Yunhao Du Zheng Zhao Yang Song Yanyun Zhao Fumin Su Tao Gong and Hongying Meng. 2023. StrongSORT: Make DeepSORT Great Again. IEEE Transactions on Multimedia 25 (2023) 8725\u20138737.","DOI":"10.1109\/TMM.2023.3240881"},{"key":"e_1_3_3_1_7_2","unstructured":"Zheng Ge Song Liu Feng Wang Zeming Li and Jian Sun. 2021. YOLOX: Exceeding YOLO Series in 2021. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.08430 (2021)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58595-2_24"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00735"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01825"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Bernardin Keni and Stiefelhagen Rainer. 2008. Evaluating multiple object tracking performance: The CLEAR MOT metrics. EURASIP Journal on Image and Video Processing 2008 1 (2008).","DOI":"10.1155\/2008\/246309"},{"key":"e_1_3_3_1_12_2","series-title":"(ICML\u201923)","first-page":"19730","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning(ICML\u201923). PMLR, 19730\u201319742. arxiv:https:\/\/arXiv.org\/abs\/2301.12597https:\/\/proceedings.mlr.press\/v202\/li23n.html"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00406"},{"key":"e_1_3_3_1_14_2","unstructured":"Yuhao Li Jiale Cao Muzammal Naseer Yu Zhu Jinqiu Sun Yanning Zhang and Fahad\u00a0Shahbaz Khan. 2025. Multi-Granularity Language-Guided Training for Multi-Object Tracking. IEEE Transactions on Circuits and Systems for Video Technology (2025)."},{"key":"e_1_3_3_1_15_2","unstructured":"Yunhao Li Xiaoqiong Liu Luke Liu Heng Fan and Libo Zhang. 2024. Lamot: Language-guided multi-object tracking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08324 (2024)."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Chao Liang Zhipeng Zhang Xue Zhou Bing Li Shuyuan Zhu and Weiming Hu. 2022. Rethinking the competition between detection and reid in multiobject tracking. IEEE Transactions on Image Processing 31 (2022) 3182\u20133196.","DOI":"10.1109\/TIP.2022.3165376"},{"key":"e_1_3_3_1_17_2","first-page":"38","volume-title":"European Conference on Computer Vision (ECCV)","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Fangchen Li, Haoyi Zhang, Jingkang Yang, and Lei Zhang. 2024. Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection. In European Conference on Computer Vision (ECCV). Springer, Springer Nature Switzerland, 38\u201355."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"Jonathon Luiten Aljosa Osep Patrick Dendorfer Philip H.\u00a0S. Torr Andreas Geiger Laura Leal-Taix\u00e9 and Bastian Leibe. 2021. HOTA: A Higher Order Metric for Evaluating Multi-Object Tracking. International Journal of Computer Vision 129 2 (2021) 548\u2013578. 10.1007\/s11263-020-01469-6","DOI":"10.1007\/s11263-020-01469-6"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"e_1_3_3_1_20_2","unstructured":"Anton Milan Laura Leal-Taix\u00e9 Ian Reid Stefan Roth and Konrad Schindler. 2016. MOT16: A benchmark for multi-object tracking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1603.00831 (2016)."},{"key":"e_1_3_3_1_21_2","unstructured":"Shuai Shao Zhe Zhao Boxun Li Tianfeng Xiao Gang Yu Xiangyu Zhang and Jian Sun. 2018. CrowdHuman: A benchmark for detecting human in a crowd. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1805.00123 (2018)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01091"},{"key":"e_1_3_3_1_23_2","unstructured":"Peize Sun Yi Jiang Rufeng Zhang Enze Xie and Ping Luo. 2020. TransTrack: Multiple-Object Tracking with Transformer. (2020)."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01068"},{"key":"e_1_3_3_1_25_2","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang and Wenbin Ge. 2024. Qwen2-VL: Enhancing Vision-Language Model\u2019s Perception of the World at Any Resolution. (2024)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_38"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00380"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771047","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:11:25Z","timestamp":1765008685000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771047"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":28,"alternative-id":["10.1145\/3743093.3771047","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771047","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}