{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:21:45Z","timestamp":1778080905354,"version":"3.51.4"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031736674","type":"print"},{"value":"9783031736681","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73668-1_20","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T02:02:30Z","timestamp":1733018550000},"page":"340-356","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Safe-CLIP: Removing NSFW Concepts from\u00a0Vision-and-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8428-501X","authenticated-orcid":false,"given":"Samuele","family":"Poppi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0573-8209","authenticated-orcid":false,"given":"Tobia","family":"Poppi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1396-9114","authenticated-orcid":false,"given":"Federico","family":"Cocchi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5125-4957","authenticated-orcid":false,"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2239-283X","authenticated-orcid":false,"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,1]]},"reference":[{"key":"20_CR1","unstructured":"Bakker, M., et\u00a0al.: Fine-tuning language models to find agreement among humans with diverse preferences. In: NeurIPS (2022)"},{"key":"20_CR2","unstructured":"Bedapudi, P.: NudeNet: neural nets for nudity classification, detection, and selective censoring (2019)"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Birhane, A., Prabhu, V.U.: Large image datasets: a pyrrhic win for computer vision? In: WACV (2021)","DOI":"10.1109\/WACV48630.2021.00158"},{"key":"20_CR4","unstructured":"Birhane, A., Prabhu, V.U., Kahembwe, E.: Multimodal datasets: misogyny, pornography, and malignant stereotypes. arXiv preprint arXiv:2110.01963 (2021)"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Caffagni, D., et al.: The revolution of multimodal large language models: a survey. In: ACL Findings (2024)","DOI":"10.18653\/v1\/2024.findings-acl.807"},{"key":"20_CR6","doi-asserted-by":"crossref","unstructured":"Cao, Y., Yang, J.: Towards making systems forget with machine unlearning. In: IEEE Symposium on Security and Privacy (2015)","DOI":"10.1109\/SP.2015.35"},{"key":"20_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.datak.2022.101979","volume":"138","author":"F Cauteruccio","year":"2022","unstructured":"Cauteruccio, F., Corradini, E., Terracina, G., Ursino, D., Virgili, L.: Extraction and analysis of text patterns from nsfw adult content in reddit. Data Knowl. Eng. 138, 101979 (2022)","journal-title":"Data Knowl. Eng."},{"key":"20_CR8","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023)"},{"key":"20_CR9","unstructured":"Christiano, P.F., Leike, J., Brown, T., Martic, M., Legg, S., Amodei, D.: Deep reinforcement learning from human preferences. In: NeurIPS (2017)"},{"issue":"1","key":"20_CR10","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0190954","volume":"13","author":"DL Crone","year":"2018","unstructured":"Crone, D.L., Bode, S., Murawski, C., Laham, S.M.: The Socio-Moral Image Database (SMID): a novel stimulus set for the study of social, moral and affective processes. PLoS ONE 13(1), e0190954 (2018)","journal-title":"PLoS ONE"},{"key":"20_CR11","unstructured":"Dettmers, T., Pagnoni, A., Holtzman, A., Zettlemoyer, L.: QLoRA: efficient finetuning of quantized LLMs. arXiv preprint arXiv:2305.14314 (2023)"},{"key":"20_CR12","unstructured":"Gadre, S.Y., et\u00a0al.: DataComp: in search of the next generation of multimodal datasets. In: NeurIPS (2024)"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Gandhi, S., et al.: scalable detection of offensive and non-compliant content\/logo in product images. In: WACV (2020)","DOI":"10.1109\/WACV45572.2020.9093454"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Gandikota, R., Materzynska, J., Fiotto-Kaufman, J., Bau, D.: Erasing concepts from diffusion models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00230"},{"key":"20_CR15","unstructured":"Gao, P., et al.: LLaMA-adapter V2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"20_CR16","unstructured":"Ginart, A., Guan, M., Valiant, G., Zou, J.Y.: Making AI forget you: data deletion in machine learning. In: NeurIPS (2019)"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Golatkar, A., Achille, A., Soatto, S.: Eternal sunshine of the spotless net: selective forgetting in deep networks. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00932"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Golatkar, A., Achille, A., Wang, Y.X., Roth, A., Kearns, M., Soatto, S.: Mixed differential privacy in computer vision. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00819"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Hidayatullah, A.F., Hakim, A.M., Sembada, A.A.: Adult content classification on indonesian tweets using LSTM neural network. In: ICACSIS (2019)","DOI":"10.1109\/ICACSIS47736.2019.8979982"},{"key":"20_CR20","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"20_CR22","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Wang, S.Y., Shechtman, E., Zhang, R., Zhu, J.Y.: Ablating concepts in text-to-image diffusion models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02074"},{"key":"20_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"20_CR25","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"20_CR26","unstructured":"Liu, Y., Singh, A., Freeman, C.D., Co-Reyes, J.D., Liu, P.J.: Improving large language model fine-tuning for solving math problems. arXiv preprint arXiv:2310.10047 (2023)"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Markov, T., et al.: A holistic approach to undesired content detection in the real world. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i12.26752"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Materzy\u0144ska, J., Torralba, A., Bau, D.: Disentangling visual and written concepts in CLIP. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01592"},{"key":"20_CR29","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"20_CR30","unstructured":"Oord, A.V.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"20_CR31","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: NeurIPS (2022)"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Poppi, S., Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R.: Multi-class unlearning for image classification via weight filtering. IEEE Intell. Syst. (2024)","DOI":"10.1109\/MIS.2024.3412742"},{"key":"20_CR33","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"issue":"8","key":"20_CR34","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"20_CR35","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., Finn, C.: Direct preference optimization: your language model is secretly a reward model. In: NeurIPS (2023)"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"20_CR37","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Schramowski, P., Brack, M., Deiseroth, B., Kersting, K.: Safe latent diffusion: mitigating inappropriate degeneration in diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02157"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Schramowski, P., Tauchmann, C., Kersting, K.: Can machines help us answering question 16 in datasheets, and in turn reflecting on inappropriate content? In: ACM FAccT (2022)","DOI":"10.1145\/3531146.3533192"},{"key":"20_CR40","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: NeurIPS (2022)"},{"key":"20_CR41","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of CLIP-filtered 400 million image-text pairs. In: NeurIPS Workshops (2021)"},{"key":"20_CR42","unstructured":"Shen, S., et al.: How much can CLIP benefit vision-and-language tasks? In: ICLR (2022)"},{"key":"20_CR43","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"20_CR44","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Trager, M., Perera, P., Zancato, L., Achille, A., Bhatia, P., Soatto, S.: Linear spaces of meanings: compositional structures in vision-language models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01412"},{"key":"20_CR46","unstructured":"Tunstall, L., et\u00a0al.: Zephyr: direct distillation of LM alignment. arXiv preprint arXiv:2310.16944 (2023)"},{"key":"20_CR47","unstructured":"Wang, M., Xing, J., Liu, Y.: ActionCLIP: a new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)"},{"key":"20_CR48","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Self-instruct: aligning language models with self-generated instructions. arXiv preprint arXiv:2212.10560 (2022)","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"20_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, E., Wang, K., Xu, X., Wang, Z., Shi, H.: Forget-me-not: learning to forget in text-to-image diffusion models. arXiv preprint arXiv:2303.17591 (2023)","DOI":"10.1109\/CVPRW63382.2024.00182"},{"key":"20_CR50","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"20_CR51","unstructured":"Zheng, L., et\u00a0al.: Judging llm-as-a-judge with mt-bench and chatbot arena. arXiv preprint arXiv:2306.05685 (2023)"},{"key":"20_CR52","unstructured":"Zhou, C., et\u00a0al.: LIMA: less is more for alignment. arXiv preprint arXiv:2305.11206 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73668-1_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T02:11:12Z","timestamp":1733019072000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73668-1_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,1]]},"ISBN":["9783031736674","9783031736681"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73668-1_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,1]]},"assertion":[{"value":"1 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}