{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T00:54:13Z","timestamp":1775868853924,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The Hong Kong Polytechnic University","award":["ZVVK-P0036744"],"award-info":[{"award-number":["ZVVK-P0036744"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611789","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"4535-4545","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":64,"title":["CLIP-Count: Towards Text-Guided Zero-Shot Object Counting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8666-6767","authenticated-orcid":false,"given":"Ruixiang","family":"Jiang","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University, HKSAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-6685","authenticated-orcid":false,"given":"Lingbo","family":"Liu","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, HKSAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6720-234X","authenticated-orcid":false,"given":"Changwen","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, HKSAR, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_2_1","unstructured":"Stanislas Dehaene. 2011. The number sense: How the mind creates mathematics. OUP USA."},{"key":"e_1_3_2_1_3_1","volume-title":"A Low-Shot Object Counting Network With Iterative Prototype Adaptation. arXiv preprint arXiv:2211.08217","author":"Djukic Nikola","year":"2022","unstructured":"Nikola Djukic, Alan Lukezic, Vitjan Zavrtanik, and Matej Kristan. 2022. A Low-Shot Object Counting Network With Iterative Prototype Adaptation. arXiv preprint arXiv:2211.08217 (2022)."},{"key":"e_1_3_2_1_4_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3083--3092","author":"Han Tao","year":"2022","unstructured":"Tao Han, Lei Bai, Junyu Gao, Qi Wang, and Wanli Ouyang. 2022. Dr. vic: Decomposition and reasoning for video individual counting. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3083--3092."},{"key":"e_1_3_2_1_6_1","volume-title":"Learning to Count Anything: Reference-less Class-agnostic Counting with Weak Supervision. arXiv preprint arXiv:2205.10203","author":"Hobley Michael","year":"2022","unstructured":"Michael Hobley and Victor Prisacariu. 2022. Learning to Count Anything: Reference-less Class-agnostic Counting with Weak Supervision. arXiv preprint arXiv:2205.10203 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"AvatarCLIP: Zero-Shot Text-Driven Generation and Animation of 3D Avatars. arXiv preprint arXiv:2205.08535","author":"Hong Fangzhou","year":"2022","unstructured":"Fangzhou Hong, Mingyuan Zhang, Liang Pan, Zhongang Cai, Lei Yang, and Ziwei Liu. 2022. AvatarCLIP: Zero-Shot Text-Driven Generation and Animation of 3D Avatars. arXiv preprint arXiv:2205.08535 (2022)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.446"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_10_1","volume-title":"Visual prompt tuning. arXiv preprint arXiv:2203.12119","author":"Jia Menglin","year":"2022","unstructured":"Menglin Jia, Luming Tang, Bor-Chun Chen, Claire Cardie, Serge Belongie, Bharath Hariharan, and Ser-Nam Lim. 2022. Visual prompt tuning. arXiv preprint arXiv:2203.12119 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3205210"},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_13_1","volume-title":"Tel Aviv","author":"Li Yanghao","year":"2022","unstructured":"Yanghao Li, Hanzi Mao, Ross Girshick, and Kaiming He. 2022c. Exploring plain vision transformer backbones for object detection. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part IX. Springer, 280--296."},{"key":"e_1_3_2_1_14_1","volume-title":"Clip surgery for better explainability with enhancement in open-vocabulary tasks. arXiv preprint arXiv:2304.05653","author":"Li Yi","year":"2023","unstructured":"Yi Li, Hualiang Wang, Yiqun Duan, and Xiaomeng Li. 2023. Clip surgery for better explainability with enhancement in open-vocabulary tasks. arXiv preprint arXiv:2304.05653 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"CrowdCLIP: Unsupervised Crowd Counting via Vision-Language Model. arXiv preprint arXiv:2304.04231","author":"Liang Dingkang","year":"2023","unstructured":"Dingkang Liang, Jiahao Xie, Zhikang Zou, Xiaoqing Ye, Wei Xu, and Xiang Bai. 2023. CrowdCLIP: Unsupervised Crowd Counting via Vision-Language Model. arXiv preprint arXiv:2304.04231 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Object Counting: You Only Need to Look at One. arXiv preprint arXiv:2112.05993","author":"Lin Hui","year":"2021","unstructured":"Hui Lin, Xiaopeng Hong, and Yabin Wang. 2021. Object Counting: You Only Need to Look at One. arXiv preprint arXiv:2112.05993 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"GridCLIP: One-Stage Object Detection by Grid-Level CLIP Representation Learning. arXiv preprint arXiv:2303.09252","author":"Lin Jiayi","year":"2023","unstructured":"Jiayi Lin and Shaogang Gong. 2023. GridCLIP: One-Stage Object Detection by Grid-Level CLIP Representation Learning. arXiv preprint arXiv:2303.09252 (2023)."},{"key":"e_1_3_2_1_18_1","unstructured":"Wei Lin Kunlin Yang Xinzhu Ma Junyu Gao Lingbo Liu Shinan Liu Jun Hou Shuai Yi and Antoni B Chan. 2022. Scale-Prior Deformable Convolution for Exemplar-Guided Class-Agnostic Counting. (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"Countr: Transformer-based generalised visual counting. arXiv preprint arXiv:2208.13721","author":"Liu Chang","year":"2022","unstructured":"Chang Liu, Yujie Zhong, Andrew Zisserman, and Weidi Xie. 2022. Countr: Transformer-based generalised visual counting. arXiv preprint arXiv:2208.13721 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413938"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00479"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00186"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3304415.3304536"},{"key":"e_1_3_2_1_24_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_25_1","volume-title":"Asian conference on computer vision. Springer, 669--684","author":"Lu Erika","year":"2018","unstructured":"Erika Lu, Weidi Xie, and Andrew Zisserman. 2018. Class-agnostic counting. In Asian conference on computer vision. Springer, 669--684."},{"key":"e_1_3_2_1_26_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_27_1","volume-title":"Teaching CLIP to Count to Ten. arXiv preprint arXiv:2302.12066","author":"Paiss Roni","year":"2023","unstructured":"Roni Paiss, Ariel Ephrat, Omer Tov, Shiran Zada, Inbar Mosseri, Michal Irani, and Tali Dekel. 2023. Teaching CLIP to Count to Ten. arXiv preprint arXiv:2302.12066 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00259"},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the Asian Conference on Computer Vision. 3121--3137","author":"Ranjan Viresh","year":"2022","unstructured":"Viresh Ranjan and Minh Hoai Nguyen. 2022. Exemplar free class agnostic counting. In Proceedings of the Asian Conference on Computer Vision. 3121--3137."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00340"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01805"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00931"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.206"},{"key":"e_1_3_2_1_37_1","volume-title":"NeRF-Art: Text-Driven Neural Radiance Fields Stylization. arXiv preprint arXiv:2212.08070","author":"Wang Can","year":"2022","unstructured":"Can Wang, Ruixiang Jiang, Menglei Chai, Mingming He, Dongdong Chen, and Jing Liao. 2022a. NeRF-Art: Text-Driven Neural Radiance Fields Stylization. arXiv preprint arXiv:2212.08070 (2022)."},{"key":"e_1_3_2_1_38_1","volume-title":"GCNet: Probing Self-Similarity Learning for Generalized Counting Network. arXiv preprint arXiv:2302.05132","author":"Wang Mingjie","year":"2023","unstructured":"Mingjie Wang, Yande Li, Jun Zhou, Graham W Taylor, and Minglun Gong. 2023. GCNet: Probing Self-Similarity Learning for Generalized Counting Network. arXiv preprint arXiv:2302.05132 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_40_1","volume-title":"Multimodal Crowd Counting with Mutual Attention Transformers. In IEEE International Conference on Multimedia and Expo. IEEE, 1--6.","author":"Wu Zhengtao","year":"2022","unstructured":"Zhengtao Wu, Lingbo Liu, Yang Zhang, Mingzhi Mao, Liang Lin, and Guanbin Li. 2022. Multimodal Crowd Counting with Mutual Attention Transformers. In IEEE International Conference on Multimedia and Expo. IEEE, 1--6."},{"key":"e_1_3_2_1_41_1","volume-title":"Zero-shot Object Counting. arXiv preprint arXiv:2303.02001","author":"Xu Jingyi","year":"2023","unstructured":"Jingyi Xu, Hieu Le, Vu Nguyen, Viresh Ranjan, and Dimitris Samaras. 2023. Zero-shot Object Counting. arXiv preprint arXiv:2303.02001 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00091"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00625"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.05.042"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.396"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.70"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_51_1","volume-title":"ZegCLIP: Towards Adapting CLIP for Zero-shot Semantic Segmentation. arXiv preprint arXiv:2212.03588","author":"Zhou Ziqin","year":"2022","unstructured":"Ziqin Zhou, Bowen Zhang, Yinjie Lei, Lingqiao Liu, and Yifan Liu. 2022c. ZegCLIP: Towards Adapting CLIP for Zero-shot Semantic Segmentation. arXiv preprint arXiv:2212.03588 (2022)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611789","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611789","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:51Z","timestamp":1755820851000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611789"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":51,"alternative-id":["10.1145\/3581783.3611789","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611789","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}