{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,23]],"date-time":"2025-12-23T10:43:55Z","timestamp":1766486635883,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62433003, 62476017"],"award-info":[{"award-number":["62433003, 62476017"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,6]]},"DOI":"10.1145\/3709026.3709063","type":"proceedings-article","created":{"date-parts":[[2025,2,15]],"date-time":"2025-02-15T10:05:41Z","timestamp":1739613941000},"page":"119-126","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["MamTrack: Vision-Language Tracking with Mamba Fusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8446-8396","authenticated-orcid":false,"given":"Donghua","family":"Chen","sequence":"first","affiliation":[{"name":"Image Processing Center, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1282-3755","authenticated-orcid":false,"given":"Hong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Image Processing Center, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5847-9630","authenticated-orcid":false,"given":"Jianbo","family":"Song","sequence":"additional","affiliation":[{"name":"Image Processing Center, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5891-8674","authenticated-orcid":false,"given":"Yachun","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Mechanical Engineering &amp; Automation, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4237-5874","authenticated-orcid":false,"given":"Yifan","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,2,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00628"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00680"},{"key":"e_1_3_3_1_5_2","unstructured":"Tianxiang Chen Zhentao Tan Tao Gong Qi Chu Yue Wu Bin Liu Jieping Ye and Nenghai\u00a0Yu Mim-istd. 2024. Mamba-in-mamba for efficient infrared small target detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.02148 (2024)."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60639-8_34"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Karthik Dinesh and Sumana Gupta. 2014. Video stabilization camera motion pattern recognition and motion tracking using spatiotemporal regularity flow. Journal of Image and Graphics 2 1 (2014) 33\u201340.","DOI":"10.12720\/joig.2.1.33-40"},{"key":"e_1_3_3_1_9_2","unstructured":"Wenhao Dong Haodong Zhu Shaohui Lin Xiaoyan Luo Yunhang Shen Xuhui Liu Juan Zhang Guodong Guo and Baochang Zhang. 2024. Fusion-mamba for cross-modality object detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.09146 (2024)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00552"},{"key":"e_1_3_3_1_11_2","unstructured":"Qi Feng Vitaly Ablavsky Qinxun Bai and Stan Sclaroff. 2019. Robust visual object tracking with natural language region proposal network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1912.02048 1 7 (2019) 8."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"e_1_3_3_1_13_2","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.00752 (2023)."},{"key":"e_1_3_3_1_14_2","unstructured":"Albert Gu Karan Goel and Christopher R\u00e9. 2021. Efficiently modeling long sequences with structured state spaces. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.00396 (2021)."},{"key":"e_1_3_3_1_15_2","unstructured":"Mingzhe Guo Zhipeng Zhang Heng Fan and Liping Jing. 2022. Divert more attention to vision-language tracking. Advances in Neural Information Processing Systems 35 (2022) 4446\u20134460."},{"key":"e_1_3_3_1_16_2","unstructured":"Mingzhe Guo Zhipeng Zhang Heng Fan Liping Jing Yilin Lyu Bing Li and Weiming Hu. 2022. Learning target-aware representation for visual tracking via informative interactions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2201.02526 (2022)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33765-9_50"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Jo\u00e3o\u00a0F Henriques Rui Caseiro Pedro Martins and Jorge Batista. 2014. High-speed tracking with kernelized correlation filters. IEEE transactions on pattern analysis and machine intelligence 37 3 (2014) 583\u2013596.","DOI":"10.1109\/TPAMI.2014.2345390"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00441"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00915"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.777"},{"key":"e_1_3_3_1_24_2","unstructured":"Wei-Tung Lin Yong-Xiang Lin Jyun-Wei Chen and Kai-Lung Hua. 2024. PixMamba: Leveraging State Space Models in a Dual-Level Architecture for Underwater Image Enhancement. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08444 (2024)."},{"key":"e_1_3_3_1_25_2","unstructured":"Yue Liu Yunjie Tian Yuzhong Zhao Hongtian Yu Lingxi Xie Yaowei Wang Qixiang Ye and Yunfan Liu. 2024. VMamba: Visual State Space Model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.10166 (2024)."},{"key":"e_1_3_3_1_26_2","unstructured":"Wenjie Luo Yujia Li Raquel Urtasun and Richard Zemel. 2016. Understanding the effective receptive field in deep convolutional neural networks. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28205"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Mathew Monfort Alex Andonian Bolei Zhou Kandan Ramakrishnan Sarah\u00a0Adel Bargal Tom Yan Lisa Brown Quanfu Fan Dan Gutfreund Carl Vondrick et\u00a0al. 2019. Moments in time dataset: one million videos for event understanding. IEEE transactions on pattern analysis and machine intelligence 42 2 (2019) 502\u2013508.","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"e_1_3_3_1_29_2","unstructured":"Siran Peng Xiangyu Zhu Haoyu Deng Zhen Lei and Liang-Jian Deng. 2024. Fusionmamba: Efficient image fusion with state space model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.07932 (2024)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_3_1_31_2","first-page":"2980","volume-title":"proceedings of the IEEE conference on computer vision and pattern recognition","author":"Ross T-YLPG","year":"2017","unstructured":"T-YLPG Ross and GKHP Doll\u00e1r. 2017. Focal loss for dense object detection. In proceedings of the IEEE conference on computer vision and pattern recognition. 2980\u20132988."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"crossref","unstructured":"Jiacheng Ruan and Suncheng Xiang. 2024. Vm-unet: Vision mamba unet for medical image segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.02491 (2024).","DOI":"10.1109\/BIBM62325.2024.10821761"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"crossref","unstructured":"Chin-Shiuh Shieh Yong-Shixa Jhan Yuan-Li Liu Mong-Fong Horng and Tsair-Fwu Lee. 2018. Video object tracking with heuristic optimization methods. Journal of Image and Graphics 6 2 (2018) 95\u201399.","DOI":"10.18178\/joig.6.2.95-99"},{"key":"e_1_3_3_1_34_2","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_3_1_35_2","unstructured":"Xiao Wang Chenglong Li Rui Yang Tianzhu Zhang Jin Tang and Bin Luo. 2018. Describe and attend to track: Learning natural language guided structural representation and visual attention for object tracking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1811.10014 (2018)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Xinyu Xie Yawen Cui Chio-In Ieong Tao Tan Xiaozhi Zhang Xubin Zheng and Zitong Yu. 2024. Fusionmamba: Dynamic feature enhancement for multimodal image fusion with mamba. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.09498 (2024).","DOI":"10.1007\/s44267-024-00072-9"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00928"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Zhengyuan Yang Tushar Kumar Tianlang Chen Jingsong Su and Jiebo Luo. 2020. Grounding-tracking-integration. IEEE Transactions on Circuits and Systems for Video Technology 31 9 (2020) 3433\u20133443.","DOI":"10.1109\/TCSVT.2020.3038720"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Saad\u00a0A Yaseen and Sreela Sasi. 2014. Robust algorithm for object detection and tracking in a dynamic scene. Journal of Image and Graphics 2 1 (2014) 41\u201345.","DOI":"10.12720\/joig.2.1.41-45"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Haojie Zhao Xiao Wang Dong Wang Huchuan Lu and Xiang Ruan. 2023. Transformer vision-language tracking via proxy token guided cross-modal fusion. Pattern Recognition Letters 168 (2023) 10\u201316.","DOI":"10.1016\/j.patrec.2023.02.023"},{"key":"e_1_3_3_1_43_2","unstructured":"Yaozong Zheng Bineng Zhong Qihua Liang Guorong Li Rongrong Ji and Xianxian Li. 2023. Towards unified token learning for vision-language tracking. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"e_1_3_3_1_45_2","unstructured":"Lianghui Zhu Bencheng Liao Qian Zhang Xinlong Wang Wenyu Liu and Xinggang Wang. 2024. Vision mamba: Efficient visual representation learning with bidirectional state space model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.09417 (2024)."}],"event":{"name":"CSAI 2024: 2024 8th International Conference on Computer Science and Artificial Intelligence (CSAI)","acronym":"CSAI 2024","location":"Beijing China"},"container-title":["Proceedings of the 2024 8th International Conference on Computer Science and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3709026.3709063","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3709026.3709063","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:31Z","timestamp":1750295851000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3709026.3709063"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"references-count":44,"alternative-id":["10.1145\/3709026.3709063","10.1145\/3709026"],"URL":"https:\/\/doi.org\/10.1145\/3709026.3709063","relation":{},"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"2025-02-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}