{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T16:09:41Z","timestamp":1781194181962,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Natural Science Foundation of Anhui Province","award":["No. 2208085J18"],"award-info":[{"award-number":["No. 2208085J18"]}]},{"name":"the Natural Science Foundation of China","award":["No. 62376004"],"award-info":[{"award-number":["No. 62376004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680878","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"9291-9300","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Breaking Modality Gap in RGBT Tracking: Coupled Knowledge Distillation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0902-2260","authenticated-orcid":false,"given":"Andong","family":"Lu","sequence":"first","affiliation":[{"name":"Information Materials and Intelligent Sensing Laboratory of Anhui Province, Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, School of Computer Science and Technology, Anhui University, HeFei, AnHui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8480-3988","authenticated-orcid":false,"given":"Jiacong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Information Materials and Intelligent Sensing Laboratory of Anhui Province, Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, School of Artificial Intelligence, Anhui University, HeFei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7233-2739","authenticated-orcid":false,"given":"Chenglong","family":"Li","sequence":"additional","affiliation":[{"name":"Information Materials and Intelligent Sensing Laboratory of Anhui Province, Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, School of Artificial Intelligence, Anhui University, HeFei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5285-8565","authenticated-orcid":false,"given":"Yun","family":"Xiao","sequence":"additional","affiliation":[{"name":"Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, School of Artificial Intelligence, Anhui University, HeFei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5948-5055","authenticated-orcid":false,"given":"Bin","family":"Luo","sequence":"additional","affiliation":[{"name":"Anhui Provincial Key Laboratory of Multimodal Cognitive Computation, School of Computer Science and Technology, Anhui University, HeFei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Cao Bing","year":"2024","unstructured":"Bing Cao, Junliang Guo, Pengfei Zhu, and Qinghua Hu. 2024. Bi-directional Adapter for Multi-modal Tracking. In Proceedings of the AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_2_1","volume-title":"A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987","author":"Carreira Joao","year":"2019","unstructured":"Joao Carreira, Eric Noland, Chloe Hillier, and Andrew Zisserman. 2019. A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987 (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16865"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_7"},{"key":"e_1_3_2_1_5_1","volume-title":"Fusion Tree Network for RGBT Tracking. In IEEE International Conference on Advanced Video and Signal Based Surveillance. 1--8.","author":"Cheng Zhiyuan","year":"2022","unstructured":"Zhiyuan Cheng, Andong Lu, Zhang Zhang, Chenglong Li, and Liang Wang. 2022. Fusion Tree Network for RGBT Tracking. In IEEE International Conference on Advanced Video and Signal Based Surveillance. 1--8."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3147974"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01281"},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-023-00003-0"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_11_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition.","author":"Hong Lingyi","year":"2024","unstructured":"Lingyi Hong, Shilin Yan, Renrui Zhang, Wanyun Li, Xinyu Zhou, Pinxue Guo, Kaixun Jiang, Yiting Chen, Jinglun Li, Zhaoyu Chen, et al. 2024. OneTracker: Unifying Visual Object Tracking with Foundation Models and Efficient Tuning.. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_13_1","volume-title":"MIRNet: A Robust RGBT Tracking Jointly with Multi-Modal Interaction and Refinement. In 2022 IEEE International Conference on Multimedia and Expo (ICME). 1--6.","author":"Hou Ruichao","year":"2022","unstructured":"Ruichao Hou, Tongwei Ren, and Gangshan Wu. 2022. MIRNet: A Robust RGBT Tracking Jointly with Multi-Modal Interaction and Refinement. In 2022 IEEE International Conference on Multimedia and Expo (ICME). 1--6."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition.","author":"Hou Xiaojun","year":"2024","unstructured":"Xiaojun Hou, Jiazheng Xing, Yijie Qian, Yaowei Guo, Shuo Xin, Junhao Chen, Kai Tang, Mengmeng Wang, Zhengkai Jiang, Liang Liu, et al. 2024. SDSTrack: Self-Distillation Symmetric Adapter Learning for Multi-Modal Visual Object Tracking. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_15_1","volume-title":"Normalization techniques in training dnns: Methodology, analysis and application","author":"Huang Lei","year":"2023","unstructured":"Lei Huang, Jie Qin, Yi Zhou, Fan Zhu, Li Liu, and Ling Shao. 2023. Normalization techniques in training dnns: Methodology, analysis and application. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01310"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00235"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2614135"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.106977"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_14"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00279"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3130533"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123289"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00745"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612341"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3087341"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3157594"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.3390\/s23146609"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3229830"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition.","author":"Pengyu Zhang","year":"2022","unstructured":"Zhang Pengyu, Jie Zhao, Dong Wang, Huchuan Lu, and Xiang Ruan. 2022. Visible-Thermal UAV Tracking: A Large-Scale Benchmark and New Baseline. In Proceedings of the IEEE conference on computer vision and pattern recognition."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611977172.69"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01312"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i13.29407"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3127492"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01140"},{"key":"e_1_3_2_1_37_1","volume-title":"Exploring fusion strategies for accurate RGBT visual object tracking. Information Fusion","author":"Tang Zhangyong","year":"2023","unstructured":"Zhangyong Tang, Tianyang Xu, Hui Li, Xiao-Jun Wu, Xuefeng Zhu, and Josef Kittler. 2023. Exploring fusion strategies for accurate RGBT visual object tracking. Information Fusion (2023), 101881."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00709"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28352"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3375505"},{"key":"e_1_3_2_1_41_1","volume-title":"MFGNet: Dynamic modality-aware filter generation for RGB-T tracking","author":"Wang Xiao","year":"2022","unstructured":"Xiao Wang, Xiujun Shu, Shilliang Zhang, Bo Jiang, Yaowei Wang, Yonghong Tian, and Feng Wu. 2022. MFGNet: Dynamic modality-aware filter generation for RGB-T tracking. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20165"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01399"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition.","author":"Wu Zongwei","year":"2024","unstructured":"Zongwei Wu, Jilai Zheng, Xiangxuan Ren, Florin-Alexandru Vasluianu, Chao Ma, Danda Pani Paudel, Luc Van Gool, and Radu Timofte. 2024. Single-Model and Any-Modality for Video Object Tracking. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20187"},{"key":"e_1_3_2_1_46_1","volume-title":"Multimodal learning with transformers: A survey","author":"Xu Peng","year":"2023","unstructured":"Peng Xu, Xiatian Zhu, and David A Clifton. 2023. Multimodal learning with transformers: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547851"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00065"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision Workshops.","author":"Zhang Lichao","unstructured":"Lichao Zhang, Martin Danelljan, Abel Gonzalez-Garcia, Joost van de Weijer, and Fahad Shahbaz Khan. 2019. Multi-Modal Fusion for End-to-End RGB-T Tracking. In Proceedings of the IEEE International Conference on Computer Vision Workshops."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01495-3"},{"key":"e_1_3_2_1_52_1","volume-title":"Jointly Modeling Motion and Appearance Cues for Robust RGB-T Tracking","author":"Zhang Pengyu","year":"2020","unstructured":"Pengyu Zhang, Jie Zhao, Dong Wang, Huchuan Lu, and Xiaoyun Yang. 2020. Jointly Modeling Motion and Appearance Cues for Robust RGB-T Tracking. IEEE Transactions on Image Processing (2020)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00523"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00656"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01341"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-023-00032-9"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16467"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00914"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25500"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350928"},{"key":"e_1_3_2_1_62_1","volume-title":"Quality-aware Feature Aggregation Network for Robust RGBT Tracking","author":"Zhu Yabin","year":"2020","unstructured":"Yabin Zhu, Chenglong Li, Jin Tang, and Bin Luo. 2020. Quality-aware Feature Aggregation Network for Robust RGBT Tracking. IEEE Transactions on Intelligent Vehicles (2020)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02015"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680878","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680878","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:08Z","timestamp":1750295888000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680878"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":63,"alternative-id":["10.1145\/3664647.3680878","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680878","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}