{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T04:10:34Z","timestamp":1743826234501,"version":"3.40.3"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887079","type":"print"},{"value":"9783031887086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88708-6_24","type":"book-chapter","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:52:16Z","timestamp":1743767536000},"page":"372-388","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Malevolence Attacks Against Pretrained Dialogue Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2964-6422","authenticated-orcid":false,"given":"Pengjie","family":"Ren","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8464-6162","authenticated-orcid":false,"given":"Ruiqi","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9076-6565","authenticated-orcid":false,"given":"Zhaochun","family":"Ren","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4592-4074","authenticated-orcid":false,"given":"Zhumin","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1086-0202","authenticated-orcid":false,"given":"Maarten","family":"de Rijke","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2663-5041","authenticated-orcid":false,"given":"Yangjun","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"issue":"6","key":"24_CR1","doi-asserted-by":"crossref","first-page":"461","DOI":"10.1038\/s42256-021-00359-2","volume":"3","author":"A Abid","year":"2021","unstructured":"Abid, A., Farooqi, M., Zou, J.: Large language models associate muslims with violence. Nat. Mach. Intell. 3(6), 461\u2013463 (2021)","journal-title":"Nat. Mach. Intell."},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Azzopardi, L., et al.: Report on the Search Futures workshop at ECIR 2024. SIGIR Forum 58(1) (2024)","DOI":"10.1145\/3687273.3687288"},{"key":"24_CR3","unstructured":"Belinkov, Y., Bisk, Y.: Synthetic and natural noise both break neural machine translation. In: International Conference on Learning Representations (2018)"},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Bertsch, A., Oh, A., Natu, S., Gangu, S., Black, A.W., Strubell, E.: Evaluating gender bias transfer from film data. In: Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP), pp. 235\u2013243 (2022)","DOI":"10.18653\/v1\/2022.gebnlp-1.24"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Biju, E., Sriram, A., Kumar, P., Khapra, M.M.: Input-specific attention subnetworks for adversarial detection. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 31\u201344 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.4"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, M., Yi, J., Chen, P.Y., Zhang, H., Hsieh, C.J.: Seq2sick: evaluating the robustness of sequence-to-sequence models with adversarial examples. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 3601\u20133608 (2020)","DOI":"10.1609\/aaai.v34i04.5767"},{"key":"24_CR7","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"24_CR8","doi-asserted-by":"crossref","unstructured":"Co, K.T., Mu\u00f1oz-Gonz\u00e1lez, L., de\u00a0Maupeou, S., Lupu, E.C.: Procedural noise adversarial examples for black-box attacks on deep convolutional networks. In: Proceedings of the 2019 ACM SIGSAC Conference on Computer and Communications Security, pp. 275\u2013289 (2019)","DOI":"10.1145\/3319535.3345660"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Davidson, T., Warmsley, D., Macy, M., Weber, I.: Automated hate speech detection and the problem of offensive language. In: Proceedings of the International AAAI Conference on Web and Social Media, vol.\u00a011, pp. 512\u2013515 (2017)","DOI":"10.1609\/icwsm.v11i1.14955"},{"key":"24_CR10","unstructured":"Finn, C., Levine, S., Abbeel, P.: Guided cost learning: deep inverse optimal control via policy optimization. In: Proceedings of the 33nd International Conference on Machine Learning, ICML 2016, New York City, NY, USA, 19\u201324 June 2016, pp. 49\u201358 (2016)"},{"key":"24_CR11","doi-asserted-by":"crossref","unstructured":"Gao, J., Xiong, C., Bennett, P., Craswell, N.: Neural Approaches to Conversational Information Retrieval. Springer (2023)","DOI":"10.1007\/978-3-031-23080-6"},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Gehman, S., Gururangan, S., Sap, M., Choi, Y., Smith, N.A.: Realtoxicityprompts: evaluating neural toxic degeneration in language models. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 3356\u20133369 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.301"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Grzes, M., Kudenko, D.: Theoretical and empirical analysis of reward shaping in reinforcement learning. In: 2009 International Conference on Machine Learning and Applications, pp. 337\u2013344 (2009)","DOI":"10.1109\/ICMLA.2009.33"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"Gu, Y., et al.: EVA2.0: investigating open-domain Chinese dialogue systems with large-scale pre-training. Mach. Intell. Res. 1\u201313 (2023)","DOI":"10.1007\/s11633-022-1387-3"},{"key":"24_CR15","unstructured":"Henderson, P., Islam, R., Bachman, P., Pineau, J., Precup, D., Meger, D.: Deep reinforcement learning that matters. In: McIlraith, S.A., Weinberger, K.Q. (eds.) Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence, (AAAI-18), the 30th Innovative Applications of Artificial Intelligence (IAAI-18), and the 8th AAAI Symposium on Educational Advances in Artificial Intelligence (EAAI-18), New Orleans, Louisiana, USA, 2\u20137 February 2018. AAAI Press (2018)"},{"key":"24_CR16","unstructured":"Huang, Z., Zhang, T.: Black-box adversarial attack with transferable model-based embedding. In: International Conference on Learning Representations (2019)"},{"key":"24_CR17","doi-asserted-by":"crossref","unstructured":"Iyyer, M., Wieting, J., Gimpel, K., Zettlemoyer, L.: Adversarial example generation with syntactically controlled paraphrase networks. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pp. 1875\u20131885 (2018)","DOI":"10.18653\/v1\/N18-1170"},{"key":"24_CR18","doi-asserted-by":"crossref","unstructured":"Jia, R., Liang, P.: Adversarial examples for evaluating reading comprehension systems. In: Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pp. 2021\u20132031 (2017)","DOI":"10.18653\/v1\/D17-1215"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Li, L., Ma, R., Guo, Q., Xue, X., Qiu, X.: Bert-attack: adversarial attack against bert using bert. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6193\u20136202 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.500"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Li, M., Yang, Y., Wei, K., Yang, X., Huang, H.: Learning universal adversarial perturbation by adversarial example. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 1350\u20131358 (2022)","DOI":"10.1609\/aaai.v36i2.20023"},{"key":"24_CR21","doi-asserted-by":"crossref","unstructured":"Li, Z., Kiseleva, J., de\u00a0Rijke, M.: Dialogue generation: from imitation learning to inverse reinforcement learning. In: The Thirty-Third AAAI Conference on Artificial Intelligence, AAAI 2019, pp. 6722\u20136729 (2019)","DOI":"10.1609\/aaai.v33i01.33016722"},{"key":"24_CR22","unstructured":"Liu, H., Wang, Z., Derr, T., Tang, J.: Chat as expected: learning to manipulate black-box neural dialogue models. arXiv preprint arXiv:2005.13170 (2020)"},{"issue":"1","key":"24_CR23","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1109\/COMST.2021.3136132","volume":"24","author":"J Liu","year":"2021","unstructured":"Liu, J., Nogueira, M., Fernandes, J., Kantarci, B.: Adversarial machine learning: a multilayer review of the state-of-the-art and challenges for wireless and mobile systems. IEEE Commun. Surv. Tutor. 24(1), 123\u2013159 (2021)","journal-title":"IEEE Commun. Surv. Tutor."},{"key":"24_CR24","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1109\/TASLP.2021.3130970","volume":"30","author":"S Liu","year":"2021","unstructured":"Liu, S., Lu, N., Chen, C., Tang, K.: Efficient combinatorial optimization for word-level adversarial textual attack. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 98\u2013111 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Liu, Y.A., Zhang, R., Guo, J., de\u00a0Rijke, M., Fan, Y., Cheng, X.: Multi-granular adversarial attacks against black-box neural ranking models. In: Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1391\u20131400. Association for Computing Machinery (2024)","DOI":"10.1145\/3626772.3657704"},{"key":"24_CR26","doi-asserted-by":"crossref","unstructured":"Liu, Z., Ren, P., Chen, Z., Ren, Z., de\u00a0Rijke, M., Zhou, M.: Learning to ask conversational questions by optimizing levenshtein distance. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 5638\u20135650 (2021)","DOI":"10.18653\/v1\/2021.acl-long.438"},{"key":"24_CR27","doi-asserted-by":"crossref","unstructured":"Niu, T., Bansal, M.: Adversarial over-sensitivity and over-stability strategies for dialogue models. In: CoNLL 2018, p. 486 (2018)","DOI":"10.18653\/v1\/K18-1047"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Parry, A., Fr\u00f6be, M., MacAvaney, S., Potthast, M., Hagen, M.: Analyzing adversarial attacks on sequence-to-sequence relevance models. In: European Conference on Information Retrieval, pp. 286\u2013302. Springer (2024)","DOI":"10.1007\/978-3-031-56060-6_19"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Perez, E., et al.: Red teaming language models with language models. arXiv preprint arXiv:2202.03286 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.225"},{"key":"24_CR30","doi-asserted-by":"crossref","unstructured":"Raj, C., Mukherjee, A., Purohit, H., Anastasopoulos, A., Zhu, Z.: Salsa: salience-based switching attack for adversarial perturbations in fake news detection models. In: European Conference on Information Retrieval, pp. 35\u201349. Springer (2024)","DOI":"10.1007\/978-3-031-56069-9_3"},{"key":"24_CR31","doi-asserted-by":"crossref","first-page":"1408","DOI":"10.1162\/tacl_a_00434","volume":"9","author":"T Schick","year":"2021","unstructured":"Schick, T., Udupa, S., Sch\u00fctze, H.: Self-diagnosis and self-debiasing: a proposal for reducing corpus-based bias in NLP. Trans. Assoc. Comput. Linguist. 9, 1408\u20131424 (2021)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"24_CR32","doi-asserted-by":"crossref","unstructured":"Shah, D.S., Schwartz, H.A., Hovy, D.: Predictive biases in natural language processing models: a conceptual framework and overview. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 5248\u20135264 (2020)","DOI":"10.18653\/v1\/2020.acl-main.468"},{"key":"24_CR33","unstructured":"Shen, L., et al.: Textdefense: adversarial text detection based on word importance entropy. arXiv preprint arXiv:2302.05892 (2023)"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Si, W.M., et al.: Why so toxic? Measuring and triggering toxic behavior in open-domain chatbots. In: Proceedings of the 2022 ACM SIGSAC Conference on Computer and Communications Security, CCS 2022, pp. 2659\u20132673. Association for Computing Machinery (2022)","DOI":"10.1145\/3548606.3560599"},{"key":"24_CR35","unstructured":"Sun, H., et al.: On the safety of conversational models: taxonomy, dataset, and benchmark. arXiv preprint arXiv:2110.08466 (2021)"},{"key":"24_CR36","unstructured":"Taori, R., et al.: Stanford Alpaca: An instruction-following LLaMA model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"24_CR37","unstructured":"Thoppilan, R., et al.: LaMDA: language models for dialog applications. arXiv preprint arXiv:2201.08239 (2022)"},{"key":"24_CR38","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"24_CR39","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"24_CR40","doi-asserted-by":"crossref","unstructured":"Tsai, Y.T., Yang, M.C., Chen, H.Y.: Adversarial attack on sentiment classification. In: Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP, pp. 233\u2013240 (2019)","DOI":"10.18653\/v1\/W19-4824"},{"key":"24_CR41","doi-asserted-by":"crossref","unstructured":"Wallace, E., Feng, S., Kandpal, N., Gardner, M., Singh, S.: Universal adversarial triggers for attacking and analyzing NLP. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 2153\u20132162 (2019)","DOI":"10.18653\/v1\/D19-1221"},{"key":"24_CR42","doi-asserted-by":"crossref","first-page":"387","DOI":"10.1162\/tacl_a_00279","volume":"7","author":"E Wallace","year":"2019","unstructured":"Wallace, E., Rodriguez, P., Feng, S., Yamada, I., Boyd-Graber, J.: Trick me if you can: human-in-the-loop generation of adversarial examples for question answering. Trans. Assoc. Comput. Linguist. 7, 387\u2013401 (2019)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"24_CR43","doi-asserted-by":"crossref","first-page":"2526","DOI":"10.1109\/TASLP.2022.3192097","volume":"30","author":"J Wang","year":"2022","unstructured":"Wang, J., Bao, R., Zhang, Z., Zhao, H.: Rethinking textual adversarial defense for pre-trained language models. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 2526\u20132540 (2022)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"24_CR44","doi-asserted-by":"crossref","unstructured":"Yu, D., Sagae, K.: Automatically exposing problems with neural dialog models. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 456\u2013470 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.37"},{"key":"24_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, P.F., Huang, Z., Bai, G.: Universal adversarial perturbations for vision-language pre-trained models. In: Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 862\u2013871. Association for Computing Machinery (2024)","DOI":"10.1145\/3626772.3657781"},{"key":"24_CR46","unstructured":"Zhang, R., Yu, T., Shen, Y., Jin, H., Chen, C., Carin, L.: Reward constrained interactive recommendation with natural language feedback. In: NeurIPS (2020)"},{"issue":"3","key":"24_CR47","first-page":"1","volume":"11","author":"WE Zhang","year":"2020","unstructured":"Zhang, W.E., Sheng, Q.Z., Alhazmi, A., Li, C.: Adversarial attacks on deep-learning models in natural language processing: a survey. ACM Trans. Intell. Syst. Technol. 11(3), 1\u201341 (2020)","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"24_CR48","unstructured":"Zhang, Y.: Malevolent dialogue response detection and evaluation. Ph.D. thesis, University of Amsterdam (2022)"},{"issue":"12","key":"24_CR49","first-page":"1477","volume":"72","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., Ren, P., de Rijke, M.: A taxonomy, data set, and benchmark for detecting and classifying malevolent dialogue responses. J. Am. Soc. Inf. Sci. 72(12), 1477\u20131497 (2021)","journal-title":"J. Am. Soc. Inf. Sci."},{"key":"24_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: DIALOGPT: large-scale generative pre-training for conversational response generation. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations, pp. 270\u2013278 (2020)","DOI":"10.18653\/v1\/2020.acl-demos.30"},{"key":"24_CR51","unstructured":"Zhao, Z., Dua, D., Singh, S.: Generating natural adversarial examples. In: International Conference on Learning Representations (2018)"},{"key":"24_CR52","unstructured":"Ziebart, B.D., Maas, A.L., Bagnell, J.A., Dey, A.K.: Maximum entropy inverse reinforcement learning. In: Proceedings of the Twenty-Third AAAI Conference on Artificial Intelligence, AAAI 2008, Chicago, Illinois, USA, 13\u201317 July 2008, pp. 1433\u20131438 (2008)"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88708-6_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:53:17Z","timestamp":1743767597000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88708-6_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887079","9783031887086"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88708-6_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The code and the Unlabeled Dialogue Dataset used to obtain the results are available at . The binary classifier trained on the MDRDC dataset is available at . The MDRDC dataset is available at .","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Reproducibility"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}