{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:46:35Z","timestamp":1772905595254,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658108","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"570-578","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Semantic-guided RGB-Thermal Crowd Counting with Segment Anything Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-9065-968X","authenticated-orcid":false,"given":"Yaqun","family":"Fang","sequence":"first","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5460-5731","authenticated-orcid":false,"given":"Yi","family":"Shi","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3731-7294","authenticated-orcid":false,"given":"Jia","family":"Bei","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3092-424X","authenticated-orcid":false,"given":"Tongwei","family":"Ren","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00127"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463628"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3289290"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Ricardo Guerrero-G\u00f3mez-Olmedo Beatriz Torre-Jim\u00e9nez Roberto L\u00f3pez-Sastre Saturnino Maldonado-Basc\u00f3n and Daniel Onoro-Rubio. 2015. Extremely overlapping vehicle counting. In Pattern Recognition and Image Analysis.","DOI":"10.1007\/978-3-319-19390-8_48"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9860018"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00203"},{"key":"e_1_3_2_1_7_1","volume-title":"Mask-aware networks for crowd counting","author":"Jiang Shengqin","year":"2019","unstructured":"Shengqin Jiang, Xiaobo Lu, Yinjie Lei, and Lingqiao Liu. 2019. Mask-aware networks for crowd counting. IEEE Transactions on Circuits and Systems for Video Technology (2019), 3119--3129."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. 
arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.119038"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3272269"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00479"},{"key":"e_1_3_2_1_12_1","volume-title":"DENet: a universal network for counting crowd with varying densities and scales. arXiv preprint arXiv:1904.08056","author":"Liu Lei","year":"2019","unstructured":"Lei Liu, Jie Jiang, Wenjing Jia, Saeed Amirgholipour, Michelle Zeibots, and Xiangjian He. 2019. DENet: a universal network for counting crowd with varying densities and scales. arXiv preprint arXiv:1904.08056 (2019)."},{"key":"e_1_3_2_1_13_1","volume-title":"2023 b. Any-to-any style transfer: making Picasso and Da Vinci collaborate. arXiv preprint arXiv:2304.09728","author":"Liu Songhua","year":"2023","unstructured":"Songhua Liu, Jingwen Ye, and Xinchao Wang. 2023 b. Any-to-any style transfer: making Picasso and Da Vinci collaborate. arXiv preprint arXiv:2304.09728 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"2023 c. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499","author":"Liu Shilong","year":"2023","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, et al. 2023 c. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3262978"},{"key":"e_1_3_2_1_16_1","volume-title":"British Machine Vision Conference.","author":"Liu Zhengyi","year":"2022","unstructured":"Zhengyi Liu, Wei Wu, Yacheng Tan, and Guanghui Zhang. 2022. RGB-T multi-modal crowd counting based on transformer. In British Machine Vision Conference."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-44824-z"},{"key":"e_1_3_2_1_18_1","volume-title":"Can sam count anything? an empirical study on sam counting. arXiv preprint arXiv:2304.10817","author":"Ma Zhiheng","year":"2023","unstructured":"Zhiheng Ma, Xiaopeng Hong, and Qinnan Shangguan. 2023. Can sam count anything? an empirical study on sam counting. arXiv preprint arXiv:2304.10817 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00169"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2023.103540"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Yi Pan Wujie Zhou Xiaohong Qian Shanshan Mao Rongwang Yang and Lu Yu. 2023. CGINet: Cross-modality grade interaction network for RGB-T crowd counting. Engineering Applications of Artificial Intelligence 106885.","DOI":"10.1016\/j.engappai.2023.106885"},{"key":"e_1_3_2_1_22_1","volume-title":"Asian conference on computer vision.","author":"Peng Tao","year":"2020","unstructured":"Tao Peng, Qing Li, and Pengfei Zhu. 2020. Rgb-t crowd counting from drone: A benchmark and mmccn network. In Asian conference on computer vision."},{"key":"e_1_3_2_1_23_1","volume-title":"IEEE\/CVF Winter Conference on Applications of Computer Vision.","author":"Ren Simiao","unstructured":"Simiao Ren, Francesco Luzi, Saad Lahrichi, Kaleb Kassaw, Leslie M. Collins, Kyle Bradbury, and Jordan M. Malof. 2024 b. Segment anything, From Space?. 
In IEEE\/CVF Winter Conference on Applications of Computer Vision."},{"key":"e_1_3_2_1_24_1","volume-title":"2024 a. Grounded SAM: Assembling Open-World Models for Diverse Visual Tasks. arXiv preprint arXiv:2401.14159","author":"Ren Tianhe","year":"2024","unstructured":"Tianhe Ren, Shilong Liu, Ailing Zeng, Jing Lin, Kunchang Li, He Cao, Jiayu Chen, Xinyu Huang, Yukang Chen, Feng Yan, Zhaoyang Zeng, Hao Zhang, Feng Li, Jie Yang, Hongyang Li, Qing Jiang, and Lei Zhang. 2024 a. Grounded SAM: Assembling Open-World Models for Diverse Visual Tasks. arXiv preprint arXiv:2401.14159 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"IEEE International Symposium on Circuits and Systems.","author":"Tang Haihan","year":"2022","unstructured":"Haihan Tang, Yi Wang, and Lap-Pui Chau. 2022. TAFNet: A three-stream adaptive fusion network for RGB-T crowd counting. In IEEE International Symposium on Circuits and Systems."},{"key":"e_1_3_2_1_26_1","volume-title":"Distribution matching for crowd counting. Advances in neural information processing systems","author":"Wang Boyu","year":"2020","unstructured":"Boyu Wang, Huidong Liu, Dimitris Samaras, and Minh Hoai Nguyen. 2020. Distribution matching for crowd counting. Advances in neural information processing systems , Vol. 33 (2020), 1595--1607."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859777"},{"key":"e_1_3_2_1_29_1","volume-title":"Jointly modeling association and motion cues for robust infrared UAV tracking. The Visual Computer","author":"Xu Boyue","year":"2024","unstructured":"Boyue Xu, Ruichao Hou, Jia Bei, Tongwei Ren, and Gangshan Wu. 2024. Jointly modeling association and motion cues for robust infrared UAV tracking. The Visual Computer (2024), 1432--2315."},{"key":"e_1_3_2_1_30_1","volume-title":"Inpaint anything: segment anything meets image inpainting. arXiv preprint arXiv:2304.06790","author":"Yu Tao","year":"2023","unstructured":"Tao Yu, Runseng Feng, Ruoyu Feng, Jinming Liu, Xin Jin, Wenjun Zeng, and Zhibo Chen. 2023. Inpaint anything: segment anything meets image inpainting. arXiv preprint arXiv:2304.06790 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"A comprehensive survey on segment anything model for vision and beyond. arXiv preprint arXiv:2305.08196","author":"Zhang Chunhui","year":"2023","unstructured":"Chunhui Zhang, Li Liu, Yawen Cui, Guanjie Huang, Weilin Lin, Yiqian Yang, and Yuehong Hu. 2023. A comprehensive survey on segment anything model for vision and beyond. arXiv preprint arXiv:2305.08196 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Asian Conference on Computer Vision.","author":"Zhang Youjia","year":"2022","unstructured":"Youjia Zhang, Soyun Choi, and Sungeun Hong. 2022. Spatio-channel attention blocks for cross-modal crowd counting. In Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01302"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428284"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3203385"},{"key":"e_1_3_2_1_36_1","volume-title":"Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. 
Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658108","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658108","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:50:41Z","timestamp":1755766241000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658108"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":36,"alternative-id":["10.1145\/3652583.3658108","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658108","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
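
The record above is a Crossref REST API "work" message for DOI 10.1145/3652583.3658108. As a minimal sketch of how such a record can be retrieved and summarized, the following Python (standard library only) fetches the same message from api.crossref.org and prints a one-line citation. The mailto address is a placeholder for the optional "polite pool" contact parameter, and the field accesses assume the shape shown above: title and container-title are lists, and each author entry carries given/family names.

```python
import json
import urllib.request

# Crossref REST API endpoint for a single work record, keyed by DOI.
# The optional mailto parameter identifies the caller to Crossref's
# "polite" pool; you@example.org is a placeholder.
DOI = "10.1145/3652583.3658108"
URL = f"https://api.crossref.org/works/{DOI}?mailto=you@example.org"

with urllib.request.urlopen(URL) as resp:
    record = json.load(resp)

# The envelope mirrors the record above: status / message-type / message.
assert record["status"] == "ok" and record["message-type"] == "work"
work = record["message"]

# title and container-title are lists; author entries carry given/family.
title = work["title"][0]
venue = work["container-title"][0]
authors = ", ".join(f"{a['given']} {a['family']}" for a in work["author"])
cited_by = work.get("is-referenced-by-count", 0)

print(f"{authors}. {title}. In {venue}, pp. {work['page']}. "
      f"doi:{work['DOI']} (cited by {cited_by})")
```

Run against the live API, this should print the four authors, the title, the ICMR '24 proceedings title, and the page range 570-578, matching the fields in the record above; volatile values such as is-referenced-by-count may have moved on since the record was indexed.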