{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,20]],"date-time":"2025-08-20T12:21:30Z","timestamp":1755692490130,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shenzhen Science and Technology Program","award":["KJZD20230923114600002"],"award-info":[{"award-number":["KJZD20230923114600002"]}]},{"name":"NSFC","award":["62272133"],"award-info":[{"award-number":["62272133"]}]},{"name":"Key Laboratory of Industrial Equipment Quality Big Data","award":["No. 2024-IEQBD-01"],"award-info":[{"award-number":["No. 2024-IEQBD-01"]}]},{"name":"Shenzhen Colleges and Universities Stable Support Program","award":["GXWD20220811170100001"],"award-info":[{"award-number":["GXWD20220811170100001"]}]},{"name":"the Yunnan Fundamental Research Projects","award":["202301AV070004"],"award-info":[{"award-number":["202301AV070004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681211","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"10258-10267","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Prototype-Guided Dual-Transformer Reasoning for Video Individual Counting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6658-0084","authenticated-orcid":false,"given":"Rui","family":"Li","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7924-4039","authenticated-orcid":false,"given":"Yishu","family":"Liu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2462-6174","authenticated-orcid":false,"given":"Huafeng","family":"Li","sequence":"additional","affiliation":[{"name":"Kunmimg University of Science and Technology, Kunming, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5156-0305","authenticated-orcid":false,"given":"Jinxing","family":"Li","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1578-2634","authenticated-orcid":false,"given":"Guangming","family":"Lu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.429"},{"key":"e_1_3_2_1_2_1","volume-title":"Confidence-based data association and discriminative deep appearance learning for robust online multi-object tracking","author":"Bae Seung-Hwan","year":"2017","unstructured":"Seung-Hwan Bae and Kuk-Jin Yoon. 2017. Confidence-based data association and discriminative deep appearance learning for robust online multi-object tracking. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 3 (2017), 595--610."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206648"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00165"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.2517-6161.1977.tb01600.x"},{"key":"e_1_3_2_1_7_1","volume-title":"CLRNet: a cross locality relation network for crowd counting in videos","author":"Dong Li","year":"2022","unstructured":"Li Dong, Haijun Zhang, Jianghong Ma, Xiaofei Xu, Yimin Yang, and QM Jonathan Wu. 2022. CLRNet: a cross locality relation network for crowd counting in videos. IEEE Transactions on Neural Networks and Learning Systems (2022)."},{"key":"e_1_3_2_1_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00145"},{"key":"e_1_3_2_1_10_1","volume-title":"U 2-Former: Nested U-shaped Transformer for Image Restoration via Multi-view Contrastive Learning","author":"Feng Xin","year":"2023","unstructured":"Xin Feng, Haobo Ji, Wenjie Pei, Jinxing Li, Guangming Lu, and David Zhang. 2023. U 2-Former: Nested U-shaped Transformer for Image Restoration via Multi-view Contrastive Learning. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Feature-aware adaptation and density alignment for crowd counting in video surveillance","author":"Gao Junyu","year":"2020","unstructured":"Junyu Gao, Yuan Yuan, and Qi Wang. 2020. Feature-aware adaptation and density alignment for crowd counting in video surveillance. IEEE transactions on cybernetics, Vol. 51, 10 (2020), 4822--4833."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3083--3092","author":"Han Tao","year":"2022","unstructured":"Tao Han, Lei Bai, Junyu Gao, Qi Wang, and Wanli Ouyang. 2022. Dr. vic: Decomposition and reasoning for video individual counting. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3083--3092."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01565"},{"key":"e_1_3_2_1_15_1","volume-title":"Counting Crowds in Bad Weather. arXiv preprint arXiv:2306.01209","author":"Huang Zhi-Kai","year":"2023","unstructured":"Zhi-Kai Huang, Wei-Ting Chen, Yuan-Chun Chiang, Sy-Yen Kuo, and Ming-Hsuan Yang. 2023. Counting Crowds in Bad Weather. arXiv preprint arXiv:2306.01209 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_17_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, Vol. 25 (2012)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00823"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3205210"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00120"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-021-3445-y"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19769-7_3"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2803518"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01901"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 19228--19237","author":"Liu Xinyan","key":"e_1_3_2_1_26_1","unstructured":"Xinyan Liu, Guorong Li, Yuankai Qi, Ziheng Yan, Zhenjun Han, Anton van den Hengel, Ming-Hsuan Yang, and Qingming Huang. 2024. Weakly Supervised Video Individual Counting. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 19228--19237."},{"key":"e_1_3_2_1_27_1","volume-title":"Guanlin Chen, Yu Wang, Qinghua Hu, and Pengfei Zhu.","author":"Liu Zhihao","year":"2023","unstructured":"Zhihao Liu, Yuanyuan Shang, Timing Li, Guanlin Chen, Yu Wang, Qinghua Hu, and Pengfei Zhu. 2023. Robust multi-drone multi-target tracking to resolve target occlusion: A benchmark. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2015.2489418"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00624"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3044219"},{"key":"e_1_3_2_1_31_1","volume-title":"Prototypical networks for few-shot learning. Advances in neural information processing systems","author":"Snell Jake","year":"2017","unstructured":"Jake Snell, Kevin Swersky, and Richard Zemel. 2017. Prototypical networks for few-shot learning. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2019.2911128"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00386"},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-06381-7_11"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00416"},{"key":"e_1_3_2_1_38_1","volume-title":"Kap Luk Chan, and Li Wang","author":"Wang Bing","year":"2016","unstructured":"Bing Wang, Gang Wang, Kap Luk Chan, and Li Wang. 2016. Tracklet association by online target-specific metric learning and coherent dynamics estimation. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 3 (2016), 589--602."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00929"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.04.071"},{"key":"e_1_3_2_1_41_1","unstructured":"Chunlong Xia Xinliang Wang Feng Lv Xin Hao and Yifeng Shi. 2024. ViT-CoMer: Vision Transformer with Convolutional Multi-scale Feature Interaction for Dense Predictions. (2024) 1--10."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.14711\/thesis-991012554769303412"},{"key":"e_1_3_2_1_43_1","first-page":"21969","article-title":"Attribute prototype network for zero-shot learning","volume":"33","author":"Xu Wenjia","year":"2020","unstructured":"Wenjia Xu, Yongqin Xian, Jiuniu Wang, Bernt Schiele, and Zeynep Akata. 2020. Attribute prototype network for zero-shot learning. Advances in Neural Information Processing Systems, Vol. 33 (2020), 21969--21980.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00411"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3268446"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01513-4"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.70"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings, Part VIII 14","author":"Zhao Zhuoyi","year":"2016","unstructured":"Zhuoyi Zhao, Hongsheng Li, Rui Zhao, and Xiaogang Wang. 2016. Crossing-line crowd counting with two-phase deep neural networks. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part VIII 14. Springer, 712--726."},{"key":"e_1_3_2_1_49_1","volume-title":"Cross-line pedestrian counting based on spatially-consistent two-stage local crowd density estimation and accumulation","author":"Zheng Huicheng","year":"2018","unstructured":"Huicheng Zheng, Zijian Lin, Jiepeng Cen, Zeyu Wu, and Yadan Zhao. 2018. Cross-line pedestrian counting based on spatially-consistent two-stage local crowd density estimation and accumulation. IEEE transactions on circuits and systems for video technology, Vol. 29, 3 (2018), 787--799."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00261"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3082297"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475377"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681211","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681211","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681211"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":53,"alternative-id":["10.1145\/3664647.3681211","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681211","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}