{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:52:58Z","timestamp":1777654378153,"version":"3.51.4"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729454","type":"print"},{"value":"9783031729461","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72946-1_8","type":"book-chapter","created":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T19:02:08Z","timestamp":1727809328000},"page":"128-145","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Three Things We Need to\u00a0Know About Transferring Stable Diffusion to\u00a0Visual Dense Prediction Tasks"],"prefix":"10.1007","author":[{"given":"Manyuan","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guanglu","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyu","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,2]]},"reference":[{"key":"8_CR1","unstructured":"Abstreiter, K., Mittal, S., Bauer, S., Sch\u00f6lkopf, B., Mehrjou, A.: Diffusion-based representation learning. arXiv preprint arXiv:2105.14257 (2021)"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Anciukevi\u010dius, T., et al.: Renderdiffusion: image diffusion for 3d reconstruction, inpainting and generation. arXiv preprint arXiv:2211.09869 (2022)","DOI":"10.1109\/CVPR52729.2023.01213"},{"key":"8_CR3","unstructured":"Baker, B., Gupta, O., Naik, N., Raskar, R.: Designing neural network architectures using reinforcement learning. arXiv preprint arXiv:1611.02167 (2016)"},{"key":"8_CR4","unstructured":"Baranchuk, D., Rubachev, I., Voynov, A., Khrulkov, V., Babenko, A.: Label-efficient semantic segmentation with diffusion models. arXiv preprint arXiv:2112.03126 (2021)"},{"key":"8_CR5","unstructured":"Bergstra, J., Bengio, Y.: Random search for hyper-parameter optimization. J. Mach. Learn. Res. 13(2) (2012)"},{"key":"8_CR6","unstructured":"Bhat, S.F., Birkl, R., Wofk, D., Wonka, P., M\u00fcller, M.: Zoedepth: zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)"},{"key":"8_CR7","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR8","unstructured":"Chen, Z., et al.: Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534 (2022)"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"8_CR10","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"issue":"1","key":"8_CR11","first-page":"1997","volume":"20","author":"T Elsken","year":"2019","unstructured":"Elsken, T., Metzen, J.H., Hutter, F.: Neural architecture search: a survey. J. Mach. Learn. Res. 20(1), 1997\u20132017 (2019)","journal-title":"J. Mach. Learn. Res."},{"key":"8_CR12","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL visual object classes challenge 2012 (VOC2012) Results. http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2012\/workshop\/index.html"},{"key":"8_CR13","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"key":"8_CR14","unstructured":"Ho, J., et\u00a0al.: Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"8_CR15","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. NeurIPS 33, 6840\u20136851 (2020)","journal-title":"NeurIPS"},{"key":"8_CR16","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: International Conference on Machine Learning, pp. 2790\u20132799. PMLR (2019)"},{"key":"8_CR17","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"8_CR18","unstructured":"Karras, T., Aittala, M., Aila, T., Laine, S.: Elucidating the design space of diffusion-based generative models. arXiv preprint arXiv:2206.00364 (2022)"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Kim, G., Kwon, T., Ye, J.C.: Diffusionclip: text-guided diffusion models for robust image manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2426\u20132435 (2022)","DOI":"10.1109\/CVPR52688.2022.00246"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P.: Panoptic feature pyramid networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6399\u20136408 (2019)","DOI":"10.1109\/CVPR.2019.00656"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Kondapaneni, N., Marks, M., Knott, M., Guimaraes, R., Perona, P.: Text-image alignment for diffusion-based perception. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13883\u201313893 (2024)","DOI":"10.1109\/CVPR52733.2024.01317"},{"key":"8_CR22","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"8_CR23","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"8_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: P-tuning v2: prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin transformer v2: scaling up capacity and resolution. In: CVPR, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"8_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"8_CR31","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"8_CR32","unstructured":"Luo, G., Dunlap, L., Park, D.H., Holynski, A., Darrell, T.: Diffusion hyperfeatures: searching through time and space for semantic correspondence. In:Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2i-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"8_CR34","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"8_CR35","doi-asserted-by":"crossref","unstructured":"Ning, J., et al.: All in tokens: unifying output space of visual tasks via soft token. arXiv preprint arXiv:2301.02229 (2023)","DOI":"10.1109\/ICCV51070.2023.01822"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Patil, V., Sakaridis, C., Liniger, A., Van\u00a0Gool, L.: P3depth: monocular depth estimation with a piecewise planarity prior. In: CVPR, pp. 1610\u20131621 (2022)","DOI":"10.1109\/CVPR52688.2022.00166"},{"key":"8_CR37","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763. PMLR (2021)"},{"key":"8_CR38","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"8_CR39","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML, pp. 8821\u20138831. PMLR (2021)"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: language-guided dense prediction with context-aware prompting. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. arXiv preprint arXiv:2208.12242 (2022)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"8_CR43","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487 (2022)"},{"key":"8_CR44","unstructured":"Schuhmann, C., et\u00a0al.: Laion-5b: an open large-scale dataset for training next generation image-text models. arXiv preprint arXiv:2210.08402 (2022)"},{"key":"8_CR45","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"8_CR46","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"8_CR47","unstructured":"Shuai, Z., Chen, Y., Mao, S., Zho, Y., Zhang, X.: Diffseg: a segmentation model for skin lesions based on diffusion difference. arXiv preprint arXiv:2404.16474 (2024)"},{"key":"8_CR48","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1007\/978-3-642-33715-4_54","volume-title":"Computer Vision \u2013 ECCV 2012","author":"N Silberman","year":"2012","unstructured":"Silberman, N., Hoiem, D., Kohli, P., Fergus, R.: indoor segmentation and support inference from RGBD images. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7576, pp. 746\u2013760. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33715-4_54"},{"key":"8_CR49","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv:2010.02502 (2020). https:\/\/arxiv.org\/abs\/2010.02502"},{"key":"8_CR50","doi-asserted-by":"crossref","unstructured":"Tan, M., et al.: Mnasnet: platform-aware neural architecture search for mobile. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2820\u20132828 (2019)","DOI":"10.1109\/CVPR.2019.00293"},{"key":"8_CR51","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"8_CR52","unstructured":"Watson, D., Chan, W., Martin-Brualla, R., Ho, J., Tagliasacchi, A., Norouzi, M.: Novel view synthesis with diffusion models. arXiv preprint arXiv:2210.04628 (2022)"},{"key":"8_CR53","doi-asserted-by":"crossref","unstructured":"Wu, W., Zhao, Y., Shou, M.Z., Zhou, H., Shen, C.: Diffumask: synthesizing images with pixel-level annotations for semantic segmentation using diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1206\u20131217 (2023)","DOI":"10.1109\/ICCV51070.2023.00117"},{"key":"8_CR54","doi-asserted-by":"crossref","unstructured":"Yang, X., Wang, X.: Diffusion model as representation learner. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 18938\u201318949 (2023)","DOI":"10.1109\/ICCV51070.2023.01736"},{"key":"8_CR55","doi-asserted-by":"crossref","unstructured":"Yuan, W., Gu, X., Dai, Z., Zhu, S., Tan, P.: New CRFs: neural window fully-connected CRFs for monocular depth estimation. arXiv preprint arXiv:2203.01502 (2022)","DOI":"10.1109\/CVPR52688.2022.00389"},{"key":"8_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"8_CR57","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2881\u20132890 (2017)","DOI":"10.1109\/CVPR.2017.660"},{"key":"8_CR58","doi-asserted-by":"crossref","unstructured":"Zhao, W., Rao, Y., Liu, Z., Liu, B., Zhou, J., Lu, J.: Unleashing text-to-image diffusion models for visual perception. arXiv preprint arXiv:2303.02153 (2023)","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"8_CR59","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"8_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Tulsiani, S.: Sparsefusion: distilling view-conditioned diffusion for 3d reconstruction. arXiv preprint arXiv:2212.00792 (2022)","DOI":"10.1109\/CVPR52729.2023.01211"},{"key":"8_CR61","unstructured":"Zimmermann, R.S., Schott, L., Song, Y., Dunn, B.A., Klindt, D.A.: Score-based generative classifiers. arXiv preprint arXiv:2110.00473 (2021)"},{"key":"8_CR62","unstructured":"Zoph, B., Le, Q.V.: Neural architecture search with reinforcement learning. arXiv preprint arXiv:1611.01578 (2016)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72946-1_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T19:03:51Z","timestamp":1727809431000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72946-1_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,2]]},"ISBN":["9783031729454","9783031729461"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72946-1_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,2]]},"assertion":[{"value":"2 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}