{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T23:43:32Z","timestamp":1769471012126,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819555666","type":"print"},{"value":"9789819555673","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5567-3_2","type":"book-chapter","created":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T13:42:10Z","timestamp":1769434930000},"page":"18-32","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["FoCLIP: A Feature-Space Misalignment Framework for\u00a0CLIP-Based Image Manipulation and\u00a0Detection"],"prefix":"10.1007","author":[{"given":"Yulin","family":"Chen","sequence":"first","affiliation":[]},{"given":"Zeyuan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Tianyuan","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Yingmei","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Liang","family":"Bai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"2_CR1","doi-asserted-by":"crossref","unstructured":"Lin, W., Mei, J., Chen, J., et al.: PreFLMR: scaling up 
fine-grained late-interaction multi-modal retrievers. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, ACL (2024)","DOI":"10.18653\/v1\/2024.acl-long.289"},{"key":"2_CR2","unstructured":"Yu, D., Zhang, X., Chen, Y., et al.: Recent Advances of Multimodal Continual Learning: a Comprehensive Survey. arXiv preprint arXiv:2410.05352 (2024)"},{"key":"2_CR3","unstructured":"Radford, A., Kim, J.W., Hallacy, C., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning, PMLR, pp. 8748\u20138763 (2021)"},{"key":"2_CR4","doi-asserted-by":"crossref","unstructured":"Xu, Z.X., Tang, F.L., Chen, Z., et al.: Toward modality gap: vision prototype learning for weakly-supervised semantic segmentation with CLIP. In: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI (2025)","DOI":"10.1609\/aaai.v39i9.32976"},{"key":"2_CR5","unstructured":"Xie, S.N., et al.: Enhancing scientific consistency in text-to-image synthesis via CLIP-based reward models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR (2025)"},{"key":"2_CR6","unstructured":"Ramesh, A., Pavlov, M., Goh, G., et al.: Zero-shot text-to-image generation. In: Proceedings of the 38th International Conference on Machine Learning, PMLR, pp. 8821\u20138831 (2021)"},{"key":"2_CR7","unstructured":"Yang, A., Pan, J., Lin, J., et al.: Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. arXiv preprint arXiv:2211.01335 (2022)"},{"key":"2_CR8","unstructured":"Jina AI.: Jina CLIP: Your CLIP Model Is Also Your Text Retriever. arXiv preprint arXiv:2405.20204 (2024)"},{"key":"2_CR9","unstructured":"Bai, J., Li, Y., Jiang, Y., et al.: BadCLIP: dual-embedding guided backdoor attacks on multimodal contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, IEEE, pp. 
15834\u201315843 (2024)"},{"issue":"5","key":"2_CR10","doi-asserted-by":"crossref","first-page":"1021","DOI":"10.1007\/s11390-021-1153-y","volume":"38","author":"T Chen","year":"2023","unstructured":"Chen, T., Zhang, H., Wang, L., et al.: Resonance attack: revealing the vulnerability of cross-modal models CLIP. J. Comput. Sci. Technol. 38(5), 1021\u20131036 (2023)","journal-title":"J. Comput. Sci. Technol."},{"key":"2_CR11","unstructured":"Han, X., Liu, R., Zhou, K., et al.: Detect-CLIP-backdoor-samples: efficient detection of natural backdoors in pre-trained models. In: International Conference on Learning Representations, ICLR (2025)"},{"key":"2_CR12","unstructured":"Li, Y., Wang, Q., Zhang, S., et al.: BDetCLIP: Test-Time Backdoor Detection for Multimodal Models. arXiv preprint arXiv:2403.05621 (2024)"},{"key":"2_CR13","unstructured":"Liang, W.X., Zhang, Y.H., Kwon, Y.C., et al.: Mind the gap: understanding the modality gap in multi-modal contrastive representation learning. In: Advances in Neural Information Processing Systems, NeurIPS, pp. 17612\u201317625 (2022)"},{"key":"2_CR14","unstructured":"Freiberger, M., Kun, P., Igel, C., et al.: Fooling contrastive language-image pre-trained models with CLIPMasterPrints. In: 2024 Transactions on Machine Learning Research, TMLR, pp. 2307\u201303798 (2024)"},{"key":"2_CR15","unstructured":"Yang, Z., Zhang, Y., Chen, H., et al.: SafeBench: A Safety Evaluation Framework for Multimodal Large Language Models. arXiv preprint arXiv:2410.18927 (2024)"},{"key":"2_CR16","unstructured":"Zhu, J., Li, J., Wang, X., et al.: MultiTrust: a comprehensive benchmark for multimodal model trustworthiness. In: Proceedings of the AAAI Conference on Artificial Intelligence, AAAI, pp. 10245\u201310256 (2025)"},{"key":"2_CR17","unstructured":"Jiang, Z., Li, J., Wu, Z., et al.: Multimodal AI: Image Generation Capabilities and Safety Challenges. 
Technical Report, The University of Hong Kong, Hong Kong (2025)"},{"key":"2_CR18","unstructured":"Chen, H., Zhang, Y., Dong, Y., et al.: How does vision-language adaptation impact the safety of vision language models? In: International Conference on Learning Representations, ICLR (2025)"},{"key":"2_CR19","unstructured":"Qiu, J.L., Zhu, Y., Shi, X.J., et al.: Are Multimodal Models Robust to Image and Text Perturbations? arXiv preprint arXiv:2212.08044 (2022)"},{"key":"2_CR20","unstructured":"Noever, D.A., Miller Noever, S.E.: Reading Isn\u2019t Believing: Adversarial Attacks on Multi-Modal Neurons. arXiv preprint arXiv:2103.10480 (2021)"},{"key":"2_CR21","unstructured":"Daras, G., Dimakis, A.G.: Discovering the hidden vocabulary of DALLE-2. In: NeurIPS 2022 Workshop on Score-Based Methods (2023)"},{"issue":"3","key":"2_CR22","doi-asserted-by":"publisher","DOI":"10.23915\/distill.00030","volume":"6","author":"G Goh","year":"2021","unstructured":"Goh, G., Cammarata, N., Voss, C., et al.: Multimodal neurons in artificial neural networks. Distill 6(3), e30 (2021)","journal-title":"Distill"},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Dong, Y., Liao, F., Pang, T., et al.: Boosting adversarial attacks with momentum. In: 2018 IEEE Conference on Computer Vision and Pattern Recognition, IEEE, pp. 9185\u20139193 (2018)","DOI":"10.1109\/CVPR.2018.00957"},{"key":"2_CR24","unstructured":"Qin, Z., Frosio, I., Chen, Y., et al.: Black-box adversarial attacks with limited queries and information. In: Proceedings of the 35th International Conference on Machine Learning, PMLR, pp. 2137\u20132146 (2018)"},{"issue":"1","key":"2_CR25","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1109\/TIFS.2008.2012215","volume":"4","author":"H Farid","year":"2009","unstructured":"Farid, H.: Exposing digital forgeries from JPEG ghosts. IEEE Trans. Inf. Forensics Secur. 4(1), 154\u2013160 (2009)","journal-title":"IEEE Trans. Inf. 
Forensics Secur."},{"key":"2_CR26","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Goh, J., Loshchilov, I., et al.: Learning rich features for image manipulation detection. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, IEEE, pp. 1053\u20131061 (2018)","DOI":"10.1109\/CVPR.2018.00116"},{"key":"2_CR27","unstructured":"Xu, W., Qi, Y., Evans, D., et al.: Detecting adversarial attacks via neural fingerprint. In: Proceedings of the 35th AAAI Conference on Artificial Intelligence, AAAI Press, pp. 10485\u201310493 (2021)"},{"key":"2_CR28","unstructured":"Chen, T., Kornblith, S., Norouzi, M., et al.: A simple framework for contrastive learning of visual representations. In: Proceedings of the 37th International Conference on Machine Learning, PMLR, pp. 1597\u20131607 (2020)"},{"key":"2_CR29","unstructured":"Wang, W., Yang, X., Hertzmann, A., et al.: Cross-modal self-attention network for image-text matching. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, IEEE, pp. 10976\u201310985 (2019)"},{"key":"2_CR30","unstructured":"Jia, C., Yang, C., Xia, Y., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the 38th International Conference on Machine Learning, PMLR, pp. 4904\u20134916 (2021)"},{"key":"2_CR31","unstructured":"Liang, Y., Wu, C., Song, T., et al.: Multi-task multimodal prompt tuning for vision-language models. In: Advances in Neural Information Processing Systems 35, NeurIPS, pp. 27933\u201327944 (2022)"},{"key":"2_CR32","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., et al.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, IEEE, pp. 
248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5567-3_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T13:42:16Z","timestamp":1769434936000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5567-3_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819555666","9789819555673"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5567-3_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 
2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}