{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,8]],"date-time":"2025-04-08T23:40:55Z","timestamp":1744155655543,"version":"3.40.3"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031728891"},{"type":"electronic","value":"9783031728907"}],"license":[{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72890-7_1","type":"book-chapter","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T19:46:07Z","timestamp":1733514367000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Class-Agnostic Object Counting with\u00a0Text-to-Image Diffusion Model"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9258-5768","authenticated-orcid":false,"given":"Xiaofei","family":"Hui","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2417-8837","authenticated-orcid":false,"given":"Qian","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1920-0371","authenticated-orcid":false,"given":"Hossein","family":"Rahmani","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4365-4165","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,7]]},"reference":[{"key":"1_CR1","unstructured":"Amini-Naieni, N., Amini-Naieni, K., Han, T., Zisserman, A.: Open-world text-specified object counting. In: British Machine Vision Conference (2023)"},{"key":"1_CR2","doi-asserted-by":"publisher","unstructured":"Arteta, C., Lempitsky, V., Zisserman, A.: Counting in the Wild. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 483\u2013498. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_30","DOI":"10.1007\/978-3-319-46478-7_30"},{"key":"1_CR3","doi-asserted-by":"publisher","unstructured":"Babu\u00a0Sam, D., Agarwalla, A., Joseph, J., Sindagi, V.A., Babu, R.V., Patel, V.M.: Completely self-supervised crowd counting via distribution matching. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, pp. 186\u2013204. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19821-2_11","DOI":"10.1007\/978-3-031-19821-2_11"},{"key":"1_CR4","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. In: Advances in Neural Information Processing Systems, vol. 33, pp. 9912\u20139924 (2020)"},{"key":"1_CR5","doi-asserted-by":"publisher","unstructured":"Chan, A.B., Liang, Z.S.J., Vasconcelos, N.: Privacy preserving crowd monitoring: counting people without people models or tracking. In: 2008 IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20137 (2008). https:\/\/doi.org\/10.1109\/CVPR.2008.4587569","DOI":"10.1109\/CVPR.2008.4587569"},{"key":"1_CR6","unstructured":"Chang, L., Yujie, Z., Andrew, Z., Weidi, X.: CounTR: transformer-based generalised visual counting. In: British Machine Vision Conference (BMVC) (2022)"},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"Chattopadhyay, P., Vedantam, R., Selvaraju, R.R., Batra, D., Parikh, D.: Counting everyday objects in everyday scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1135\u20131144 (2017)","DOI":"10.1109\/CVPR.2017.471"},{"issue":"4","key":"1_CR8","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1_CR9","unstructured":"Foo, L.G., Rahmani, H., Liu, J.: AIGC for various data modalities: a survey. arXiv preprint arXiv:2308.14177 (2023)"},{"key":"1_CR10","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=NAQvF08TcyG"},{"key":"1_CR11","doi-asserted-by":"publisher","unstructured":"Gong, S., Zhang, S., Yang, J., Dai, D., Schiele, B.: Class-agnostic object counting robust to intraclass diversity. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, pp. 388\u2013403. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_23","DOI":"10.1007\/978-3-031-19827-4_23"},{"key":"1_CR12","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-or, D.: Prompt-to-prompt image editing with cross-attention control. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=_CDixzkzeyb"},{"key":"1_CR13","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"1_CR14","unstructured":"Hobley, M., Prisacariu, V.: Learning to count anything: reference-less class-agnostic counting with weak supervision. arXiv preprint arXiv:2205.10203 (2022)"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Hsieh, M.R., Lin, Y.L., Hsu, W.H.: Drone-based object counting by spatially regularized regional proposal networks. In: The IEEE International Conference on Computer Vision (ICCV). IEEE (2017)","DOI":"10.1109\/ICCV.2017.446"},{"key":"1_CR16","doi-asserted-by":"publisher","unstructured":"Jiang, R., Liu, L., Chen, C.: CLIP-count: towards text-guided zero-shot object counting. In: MM \u201923, Proceedings of the 31st ACM International Conference on Multimedia, pp. 4535\u20134545. Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3581783.3611789","DOI":"10.1145\/3581783.3611789"},{"key":"1_CR17","unstructured":"Khani, A., Asgari, S., Sanghi, A., Amiri, A.M., Hamarneh, G.: SLime: segment like me. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=7FeIRqCedv"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Laradji, I.H., Rostamzadeh, N., Pinheiro, P.O., Vazquez, D., Schmidt, M.: Where are the blobs: counting by localization with point supervision. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 547\u2013562 (2018)","DOI":"10.1007\/978-3-030-01216-8_34"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Liang, D., Xie, J., Zou, Z., Ye, X., Xu, W., Bai, X.: CrowdCLIP: unsupervised crowd counting via vision-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2893\u20132903 (2023)","DOI":"10.1109\/CVPR52729.2023.00283"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Lin, W., Chan, A.B.: Optimal transport minimization: crowd localization on density maps for semi-supervised counting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 21663\u201321673 (2023)","DOI":"10.1109\/CVPR52729.2023.02075"},{"key":"1_CR21","unstructured":"Lin, W., et al.: Scale-prior deformable convolution for exemplar-guided class-agnostic counting. In: 33rd British Machine Vision Conference 2022, BMVC 2022, London, UK, November 21\u201324, 2022. BMVA Press (2022). https:\/\/bmvc2022.mpi-inf.mpg.de\/0313.pdf"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Liu, W., Salzmann, M., Fua, P.: Context-aware crowd counting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5099\u20135108 (2019)","DOI":"10.1109\/CVPR.2019.00524"},{"key":"1_CR23","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"1_CR24","doi-asserted-by":"publisher","unstructured":"Lu, E., Xie, W., Zisserman, A.: Class-agnostic counting. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11363, pp. 669\u2013684. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20893-6_42","DOI":"10.1007\/978-3-030-20893-6_42"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Michel, A., Gross, W., Schenkel, F., Middelmann, W.: Class-aware object counting. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 469\u2013478 (2022)","DOI":"10.1109\/WACVW54805.2022.00053"},{"key":"1_CR26","doi-asserted-by":"publisher","unstructured":"Mundhenk, T.N., Konjevod, G., Sakla, W.A., Boakye, K.: A large contextual dataset for classification, detection and counting of cars with deep learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 785\u2013800. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_48","DOI":"10.1007\/978-3-319-46487-9_48"},{"key":"1_CR27","doi-asserted-by":"publisher","unstructured":"Nguyen, T., Pham, C., Nguyen, K., Hoai, M.: Few-shot object counting and detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, pp. 348\u2013365. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20044-1_20","DOI":"10.1007\/978-3-031-20044-1_20"},{"issue":"25","key":"1_CR28","doi-asserted-by":"publisher","first-page":"E5716","DOI":"10.1073\/pnas.1719367115","volume":"115","author":"MS Norouzzadeh","year":"2018","unstructured":"Norouzzadeh, M.S., et al.: Automatically identifying, counting, and describing wild animals in camera-trap images with deep learning. Proc. Natl. Acad. Sci. 115(25), E5716\u2013E5725 (2018)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"1_CR29","doi-asserted-by":"publisher","unstructured":"Peng, D., Zhang, Z., Hu, P., Ke, Q., Yau, D., Liu, J.: Harnessing text-to-image diffusion models for category-agnostic pose estimation. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) European Conference on Computer Vision. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-72624-8_20","DOI":"10.1007\/978-3-031-72624-8_20"},{"key":"1_CR30","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"1_CR31","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"1_CR32","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"1_CR33","unstructured":"Ranasinghe, Y., Nair, N.G., Bandara, W.G.C., Patel, V.M.: Diffuse-denoise-count: accurate crowd-counting with diffusion models. arXiv preprint arXiv:2303.12790 (2023)"},{"key":"1_CR34","unstructured":"Ranjan, V., Nguyen, M.H.: Exemplar free class agnostic counting. In: Proceedings of the Asian Conference on Computer Vision, pp. 3121\u20133137 (2022)"},{"key":"1_CR35","doi-asserted-by":"crossref","unstructured":"Ranjan, V., Sharma, U., Nguyen, T., Hoai, M.: Learning to count everything. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3394\u20133403 (2021)","DOI":"10.1109\/CVPR46437.2021.00340"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1_CR37","doi-asserted-by":"publisher","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"1_CR38","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"1_CR39","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 36479\u201336494 (2022)"},{"key":"1_CR40","doi-asserted-by":"crossref","unstructured":"Shi, M., Lu, H., Feng, C., Liu, C., Cao, Z.: Represent, compare, and learn: a similarity-aware framework for class-agnostic counting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9529\u20139538 (2022)","DOI":"10.1109\/CVPR52688.2022.00931"},{"key":"1_CR41","doi-asserted-by":"crossref","unstructured":"Song, Q., et al.: Rethinking counting and localization in crowds: a purely point-based framework. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00335"},{"key":"1_CR42","doi-asserted-by":"crossref","unstructured":"Sundararaman, R., De\u00a0Almeida\u00a0Braga, C., Marchand, E., Pettre, J.: Tracking pedestrian heads in dense crowd. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3865\u20133875 (2021)","DOI":"10.1109\/CVPR46437.2021.00386"},{"key":"1_CR43","doi-asserted-by":"crossref","unstructured":"\u0110uki\u0107, N., Luke\u017ei\u010d, A., Zavrtanik, V., Kristan, M.: A low-shot object counting network with iterative prototype adaptation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 18872\u201318881 (2023)","DOI":"10.1109\/ICCV51070.2023.01730"},{"key":"1_CR44","doi-asserted-by":"crossref","unstructured":"Wang, M., Li, Y., Zhou, J., Taylor, G.W., Gong, M.: GCNet: probing self-similarity learning for generalized counting network. arXiv preprint arXiv:2302.05132 (2023)","DOI":"10.1016\/j.patcog.2024.110513"},{"key":"1_CR45","doi-asserted-by":"crossref","unstructured":"Xu, J., Le, H., Nguyen, V., Ranjan, V., Samaras, D.: Zero-shot object counting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15548\u201315557 (2023)","DOI":"10.1109\/CVPR52729.2023.01492"},{"key":"1_CR46","doi-asserted-by":"crossref","unstructured":"Yang, S.D., Su, H.T., Hsu, W.H., Chen, W.C.: Class-agnostic few-shot object counting. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 870\u2013878 (2021)","DOI":"10.1109\/WACV48630.2021.00091"},{"key":"1_CR47","doi-asserted-by":"crossref","unstructured":"You, Z., Yang, K., Luo, W., Lu, X., Cui, L., Le, X.: Few-shot object counting with similarity-aware feature enhancement. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 6315\u20136324 (2023)","DOI":"10.1109\/WACV56688.2023.00625"},{"key":"1_CR48","unstructured":"Zenglin\u00a0Shi, Ying\u00a0Sun, M.Z.: Training-free object counting with prompts. In: WACV (2024)"},{"key":"1_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"1_CR50","doi-asserted-by":"publisher","unstructured":"Zhang, Z., Xu, L., Peng, D., Rahmani, H., Liu, J.: Diff-tracker: text-to-image diffusion models are unsupervised trackers. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) European Conference on Computer Vision. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-73390-1_19","DOI":"10.1007\/978-3-031-73390-1_19"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72890-7_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:02:47Z","timestamp":1733515367000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72890-7_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,7]]},"ISBN":["9783031728891","9783031728907"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72890-7_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}