{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:05:53Z","timestamp":1765343153992,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755617","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"8682-8691","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["End-to-End Multiple Object Tracking with Dynamic Scene Perception"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2562-6021","authenticated-orcid":false,"given":"Ruonan","family":"Wei","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5232-008X","authenticated-orcid":false,"given":"Yuntao","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3762-0479","authenticated-orcid":false,"given":"Siyan","family":"Fang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan City, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7046-7587","authenticated-orcid":false,"given":"Yuehuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1155\/2008\/246309"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7533003"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00672"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3301975"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611728"},{"key":"e_1_3_2_1_8_1","volume-title":"Multiple object tracking as id prediction. arXiv preprint arXiv:2403.16848","author":"Gao Ruopeng","year":"2024","unstructured":"Ruopeng Gao, Ji Qi, and Limin Wang. 2024a. Multiple object tracking as id prediction. arXiv preprint arXiv:2403.16848 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00908"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3364828"},{"key":"e_1_3_2_1_11_1","volume-title":"Yolox: Exceeding yolo series in","author":"Ge Z","year":"2021","unstructured":"Z Ge. 2021. Yolox: Exceeding yolo series in 2021. arXiv preprint arXiv:2107.08430 (2021)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681530"},{"key":"e_1_3_2_1_14_1","volume-title":"OVTR: End-to-End Open-Vocabulary Multiple Object Tracking with Transformer. arXiv preprint arXiv:2503.10616","author":"Li Jinyang","year":"2025","unstructured":"Jinyang Li, En Yu, Sijia Chen, and Wenbing Tao. 2025. OVTR: End-to-End Open-Vocabulary Multiple Object Tracking with Transformer. arXiv preprint arXiv:2503.10616 (2025)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00060"},{"key":"e_1_3_2_1_16_1","first-page":"740","volume-title":"Switzerland","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part v 13. Springer, 740-755."},{"key":"e_1_3_2_1_17_1","volume-title":"Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329","author":"Liu Shilong","year":"2022","unstructured":"Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, and Lei Zhang. 2022. Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"HOTA: A higher order metric for evaluating multi-object tracking. International journal of computer vision","author":"Luiten Jonathon","year":"2021","unstructured":"Jonathon Luiten, Aljosa Osep, Patrick Dendorfer, Philip Torr, Andreas Geiger, Laura Leal-Taix\u00e9, and Bastian Leibe. 2021. HOTA: A higher order metric for evaluating multi-object tracking. International journal of computer vision, Vol. 129 (2021), 548-578."},{"key":"e_1_3_2_1_19_1","volume-title":"Multiple object tracking: A literature review. Artificial intelligence","author":"Luo Wenhan","year":"2021","unstructured":"Wenhan Luo, Junliang Xing, Anton Milan, Xiaoqin Zhang, Wei Liu, and Tae-Kyun Kim. 2021. Multiple object tracking: A literature review. Artificial intelligence, Vol. 293 (2021), 103448."},{"key":"e_1_3_2_1_20_1","volume-title":"Jrdb: A dataset and benchmark of egocentric robot visual perception of humans in built environments","author":"Martin-Martin Roberto","year":"2021","unstructured":"Roberto Martin-Martin, Mihir Patel, Hamid Rezatofighi, Abhijeet Shenoi, JunYoung Gwak, Eric Frankel, Amir Sadeghian, and Silvio Savarese. 2021. Jrdb: A dataset and benchmark of egocentric robot visual perception of humans in built environments. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 6 (2021), 6748-6765."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00864"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01797"},{"key":"e_1_3_2_1_23_1","volume-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 6 (2016), 1137-1149."},{"key":"e_1_3_2_1_24_1","volume-title":"Multi-camera Tracking. In Computer Vision - ECCV 2016 Workshops. Springer International Publishing, Cham, 17-35","author":"Ristani Ergys","year":"2016","unstructured":"Ergys Ristani, Francesco Solera, Roger Zou, Rita Cucchiara, and Carlo Tomasi. 2016. Performance Measures and a Data Set for Multi-target, Multi-camera Tracking. In Computer Vision - ECCV 2016 Workshops. Springer International Publishing, Cham, 17-35."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00340"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00138-024-01531-5"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02032"},{"key":"e_1_3_2_1_28_1","volume-title":"TransTrack: Multiple object tracking with transformer. arXiv preprint arXiv:2012.15460","author":"Sun Peize","year":"2020","unstructured":"Peize Sun, Jinkun Cao, Yi Jiang, Rufeng Zhang, Enze Xie, Zehuan Yuan, Changhu Wang, and Ping Luo. 2020. TransTrack: Multiple object tracking with transformer. arXiv preprint arXiv:2012.15460 (2020)."},{"key":"e_1_3_2_1_29_1","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008), 2579-2605.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_30_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.3389\/fenvs.2024.1408370"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_7"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680944"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2024.106539"},{"key":"e_1_3_2_1_36_1","volume-title":"TransCenter: Transformers with dense representations for multiple-object tracking","author":"Xu Yihong","year":"2022","unstructured":"Yihong Xu, Yutong Ban, Guillaume Delorme, Chuang Gan, Daniela Rus, and Xavier Alameda-Pineda. 2022. TransCenter: Transformers with dense representations for multiple-object tracking. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 6 (2022), 7820-7835."},{"key":"e_1_3_2_1_37_1","volume-title":"Bridging the gap between end-to-end and non-end-to-end multi-object tracking. arXiv preprint arXiv:2305.12724","author":"Yan Feng","year":"2023","unstructured":"Feng Yan, Weixin Luo, Yujie Zhong, Yiyang Gan, and Lin Ma. 2023. Bridging the gap between end-to-end and non-end-to-end multi-object tracking. arXiv preprint arXiv:2305.12724 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Deep learning for person re-identification: A survey and outlook","author":"Ye Mang","year":"2021","unstructured":"Mang Ye, Jianbing Shen, Gaojie Lin, Tao Xiang, Ling Shao, and Steven CH Hoi. 2021. Deep learning for person re-identification: A survey and outlook. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 6 (2021), 2872-2893."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28493"},{"key":"e_1_3_2_1_40_1","volume-title":"Motrv3: Release-fetch supervision for end-to-end multi-object tracking. arXiv preprint arXiv:2305.14298","author":"Yu En","year":"2023","unstructured":"En Yu, Tiancai Wang, Zhuoling Li, Yuang Zhang, Xiangyu Zhang, and Wenbing Tao. 2023. Motrv3: Release-fetch supervision for end-to-end multi-object tracking. arXiv preprint arXiv:2305.14298 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_38"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01513-4"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02112"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00614"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_28"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00857"},{"key":"e_1_3_2_1_48_1","volume-title":"Looking beyond two frames: End-to-end multi-object tracking using spatial and temporal transformers","author":"Zhu Tianyu","year":"2022","unstructured":"Tianyu Zhu, Markus Hiller, Mahsa Ehsanpour, Rongkai Ma, Tom Drummond, Ian Reid, and Hamid Rezatofighi. 2022. Looking beyond two frames: End-to-end multi-object tracking using spatial and temporal transformers. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 11 (2022), 12783-12797."},{"key":"e_1_3_2_1_49_1","volume-title":"Deformable DETR: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020a. Deformable DETR: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."},{"key":"e_1_3_2_1_50_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020b. Deformable detr: Deformable transformers for end-to-end object detection. arXiv 2020. arXiv preprint arXiv:2010.04159, Vol. 3 (2020)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755617","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:02:33Z","timestamp":1765342953000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755617"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3755617","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755617","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}