{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T05:54:16Z","timestamp":1763790856032,"version":"3.45.0"},"publisher-location":"Singapore","reference-count":25,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533459","type":"print"},{"value":"9789819533466","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T00:00:00Z","timestamp":1763856000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T00:00:00Z","timestamp":1763856000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3346-6_14","type":"book-chapter","created":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T05:50:07Z","timestamp":1763790607000},"page":"181-192","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Gradient Co-occurrence Analysis for\u00a0Detecting Unsafe Prompts in\u00a0Large Language Models"],"prefix":"10.1007","author":[{"given":"Jingyuan","family":"Yang","sequence":"first","affiliation":[]},{"given":"Bowen","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Rongjun","family":"Li","sequence":"additional","affiliation":[]},{"given":"Ziyu","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Xin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zhiyong","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Peng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,23]]},"reference":[{"key":"14_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"14_CR2","unstructured":"AlibabaCloud: Content moderation (2024). https:\/\/vision.aliyun.com\/imageaudit. Accessed 08 Feb 2025"},{"key":"14_CR3","doi-asserted-by":"crossref","unstructured":"Azaria, A., Mitchell, T.: The internal state of an LLM knows when it\u2019s lying. arXiv preprint arXiv:2304.13734 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.68"},{"issue":"7","key":"14_CR4","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0130140","volume":"10","author":"S Bach","year":"2015","unstructured":"Bach, S., Binder, A., Montavon, G., Klauschen, F., M\u00fcller, K.R., Samek, W.: On pixel-wise explanations for non-linear classifier decisions by layer-wise relevance propagation. PLoS ONE 10(7), e0130140 (2015)","journal-title":"PLoS ONE"},{"key":"14_CR5","unstructured":"BaiduAI: Text censoring technology (2024). https:\/\/ai.baidu.com\/tech\/textcensoring. Accessed 08 Feb 2025"},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Caselli, T., Basile, V., Mitrovi\u0107, J., Granitzer, M.: Hatebert: retraining bert for abusive language detection in english. In: Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021), pp. 17\u201325 (2021)","DOI":"10.18653\/v1\/2021.woah-1.3"},{"key":"14_CR7","unstructured":"Chi, J., et al.: Llama guard 3 vision: safeguarding human-AI image understanding conversations. arXiv preprint arXiv:2411.10414 (2024)"},{"key":"14_CR8","unstructured":"Hanu, L., Unitary team: Detoxify. Github (2020). https:\/\/github.com\/unitaryai\/detoxify"},{"key":"14_CR9","unstructured":"Inan, H., et al.: Llama guard: LLM-based input-output safeguard for human-AI conversations. arXiv preprint arXiv:2312.06674 (2023)"},{"key":"14_CR10","doi-asserted-by":"crossref","unstructured":"Lin, Z., et al.: Toxicchat: unveiling hidden challenges of toxicity detection in real-world user-AI conversation. arXiv preprint arXiv:2310.17389 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.311"},{"key":"14_CR11","unstructured":"Llama Team: The llama 3 herd of models (2024). https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"14_CR12","unstructured":"Microsoft: Azure AI content safety: detect and moderate harmful content in text and images (2024). https:\/\/azure.microsoft.com\/en-us\/products\/ai-services\/ai-content-safety. Accessed 08 Jan 2025"},{"key":"14_CR13","unstructured":"OpenAI: Moderation API: a tool for content moderation in language models (2024). https:\/\/platform.openai.com\/docs\/guides\/moderation\/. Accessed 08 Jan 2025"},{"key":"14_CR14","unstructured":"Perspective: Perspective API: a tool for toxicity detection in online content (2024). https:\/\/perspectiveapi.com\/. Accessed 08 Jan 2025"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Rebedea, T., Dinu, R., Sreedhar, M., Parisien, C., Cohen, J.: Nemo guardrails: a toolkit for controllable and safe LLM applications with programmable rails. arXiv preprint arXiv:2310.10501 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.40"},{"key":"14_CR16","doi-asserted-by":"crossref","unstructured":"R\u00f6ttger, P., Kirk, H.R., Vidgen, B., Attanasio, G., Bianchi, F., Hovy, D.: Xstest: a test suite for identifying exaggerated safety behaviours in large language models. arXiv preprint arXiv:2308.01263 (2023)","DOI":"10.18653\/v1\/2024.naacl-long.301"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"14_CR18","unstructured":"Shrikumar, A., Greenside, P., Shcherbina, A., Kundaje, A.: Not just a black box: learning important features through propagating activation differences. arXiv preprint arXiv:1605.01713 (2016)"},{"key":"14_CR19","unstructured":"Sundararajan, M., Taly, A., Yan, Q.: Axiomatic attribution for deep networks. In: International Conference on Machine Learning, pp. 3319\u20133328. PMLR (2017)"},{"key":"14_CR20","unstructured":"Gemma Team, et al.: Gemma: open models based on Gemini research and technology. arXiv preprint arXiv:2403.08295 (2024)"},{"key":"14_CR21","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Wang, W., Haddow, B., Birch, A., Peng, W.: Assessing factual reliability of large language model knowledge. In: Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 805\u2013819 (2024)","DOI":"10.18653\/v1\/2024.naacl-long.46"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Xie, Y., Fang, M., Pi, R., Gong, N.: Gradsafe: detecting unsafe prompts for LLMs via safety-critical gradient analysis. arXiv preprint arXiv:2402.13494 (2024)","DOI":"10.18653\/v1\/2024.acl-long.30"},{"key":"14_CR24","unstructured":"Yang, A., et al.: Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, Z., et al.: Shieldlm: empowering LLMs as aligned, customizable and explainable safety detectors. In: Findings of the Association for Computational Linguistics: EMNLP 2024, pp. 10420\u201310438 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.610"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3346-6_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T05:50:10Z","timestamp":1763790610000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3346-6_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,23]]},"ISBN":["9789819533459","9789819533466"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3346-6_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,23]]},"assertion":[{"value":"23 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}